/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable. otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}
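
/* A short note on the helper below: it confirms reachability of the
 * neighbour entry this dst resolves to, using the same daddr choice
 * as ip6_neigh_lookup() above.  Devices that do not run neighbour
 * discovery (IFF_NOARP, IFF_LOOPBACK) and multicast destinations
 * have no neighbour entry to confirm, hence the early returns.
 */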
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
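
/* Note on rt6_info_init() above: dst_alloc() has already set up the
 * embedded dst_entry, so only the rt6_info-private tail is zeroed;
 * "dst + 1" points just past the dst_entry member inside rt6_info.
 */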
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (the siblings list does not include ourself)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
					 &match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
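
/* Illustration of the selection above (assuming a 3-way multipath
 * route, i.e. rt6i_nsiblings == 2): the candidate set has three
 * members, so mp_hash == 7 gives 7 % 3 == 1 and the walk stops at
 * the first sibling, while any hash that is a multiple of 3 keeps
 * @match itself.
 */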
/*
 *	Route lookup. rcu_read_lock() should be held.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
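
/* rt6_probe() below runs under rcu_read_lock_bh() and may hold the
 * neighbour lock, so the actual solicitation is handed off to
 * rt6_probe_deferred() above and sent from process context instead.
 */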
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->dst.rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf)
		return net->ipv6.ip6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next);

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.ip6_null_entry;
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1)
			return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
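
/* fib6_backtrack() walks from a failed node towards the tree root:
 * each step retries the parent's source-routed subtree (see
 * CONFIG_IPV6_SUBTREES) before the parent itself, and the walk stops
 * at the first node carrying routes (RTN_RTINFO) or at the table
 * root (RTN_TL_ROOT), in which case NULL ends the lookup.
 */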
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
		rt = net->ipv6.ip6_null_entry;
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
			rt = rt6_multipath_select(rt, fl6,
						  fl6->flowi6_oif, flags);
	}
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
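
/* Usage note: a successful rt6_lookup() returns a route with a dst
 * reference held; callers are expected to drop it with ip6_rt_put().
 */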
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
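
/* rt6_make_pcpu_route() below publishes the new clone with cmpxchg().
 * The BUG_ON(prev) holds because each CPU only ever fills its own
 * rt6i_pcpu slot, and the caller (ip6_pol_route()) runs this with
 * bottom halves disabled, so the slot cannot be filled concurrently.
 */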
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}
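
/* Exception (RTF_CACHE) entries are keyed by destination address and,
 * when the parent route lives in a source-routed subtree, by source
 * address as well.  Roughly, mirroring rt6_exception_hash() above:
 *
 *	hval = jhash(daddr, sizeof(*daddr), seed);
 *	if (src_key)
 *		hval = jhash(saddr, sizeof(*saddr), hval);
 *	bucket = &base[hash_32(hval, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)];
 *
 * rt6_insert_exception() below allocates the bucket array lazily and
 * evicts the oldest entry once a bucket grows past FIB6_MAX_DEPTH.
 */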
static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct net *net = dev_net(ort->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}
/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}
static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from has already
				 * been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
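
/* ip6_pol_route() resolves a flow to one of four kinds of result:
 * the netns null entry when nothing matches, an existing RTF_CACHE
 * exception entry, a fresh uncached RTF_CACHE clone (for
 * FLOWI_FLAG_KNOWN_NH lookups on routes without a gateway), or, in
 * the common case, a per-cpu copy of the matched fib6 route.
 */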
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt, *rt_cache;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(net, fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

	if (rt == net->ipv6.ip6_null_entry) {
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
			rt6_dst_from_metrics_check(rt);
		}
		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
			rcu_read_unlock();
			uncached_rt = rt;
			goto uncached_rt_out;
		}
		rcu_read_unlock();

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

uncached_rt_out:
		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		dst_use_noref(&rt->dst, jiffies);
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (!pcpu_rt) {
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
				/* No dst_hold() on rt is needed because grabbing
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
				pcpu_rt = rt6_make_pcpu_route(rt);
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
		}
		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);
	}

	return get_hash_from_flowi6(fl6);
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU ||
		rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6))
				dst_release_immediate(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
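
/* ip6_update_pmtu() below is the socket-less entry point for path MTU
 * updates, e.g. from tunnel or IPsec ICMPv6 error handlers: it
 * rebuilds a flow from the queued packet's header, looks up the
 * route, and applies the new MTU via __ip6_rt_update_pmtu().  Note
 * that @mtu arrives in network byte order and is converted with
 * ntohl().
 */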
mark : IP6_REPLY_MARK(net, skb->mark); 2088 fl6.daddr = iph->daddr; 2089 fl6.saddr = iph->saddr; 2090 fl6.flowlabel = ip6_flowinfo(iph); 2091 fl6.flowi6_uid = uid; 2092 2093 dst = ip6_route_output(net, NULL, &fl6); 2094 if (!dst->error) 2095 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu)); 2096 dst_release(dst); 2097 } 2098 EXPORT_SYMBOL_GPL(ip6_update_pmtu); 2099 2100 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) 2101 { 2102 struct dst_entry *dst; 2103 2104 ip6_update_pmtu(skb, sock_net(sk), mtu, 2105 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid); 2106 2107 dst = __sk_dst_get(sk); 2108 if (!dst || !dst->obsolete || 2109 dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) 2110 return; 2111 2112 bh_lock_sock(sk); 2113 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) 2114 ip6_datagram_dst_update(sk, false); 2115 bh_unlock_sock(sk); 2116 } 2117 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); 2118 2119 /* Handle redirects */ 2120 struct ip6rd_flowi { 2121 struct flowi6 fl6; 2122 struct in6_addr gateway; 2123 }; 2124 2125 static struct rt6_info *__ip6_route_redirect(struct net *net, 2126 struct fib6_table *table, 2127 struct flowi6 *fl6, 2128 int flags) 2129 { 2130 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; 2131 struct rt6_info *rt, *rt_cache; 2132 struct fib6_node *fn; 2133 2134 /* Get the "current" route for this destination and 2135 * check if the redirect has come from the appropriate router. 2136 * 2137 * RFC 4861 specifies that redirects should only be 2138 * accepted if they come from the nexthop to the target. 2139 * Due to the way the routes are chosen, this notion 2140 * is a bit fuzzy and one might need to check all possible 2141 * routes. 2142 */ 2143 2144 rcu_read_lock(); 2145 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); 2146 restart: 2147 for_each_fib6_node_rt_rcu(fn) { 2148 if (rt6_check_expired(rt)) 2149 continue; 2150 if (rt->dst.error) 2151 break; 2152 if (!(rt->rt6i_flags & RTF_GATEWAY)) 2153 continue; 2154 if (fl6->flowi6_oif != rt->dst.dev->ifindex) 2155 continue; 2156 /* rt_cache's gateway might be different from its 'parent' 2157 * in the case of an ip redirect. 2158 * So we keep searching in the exception table if the gateway 2159 * is different.
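 * A cached clone created by an earlier redirect carries the redirected
 * router in its rt6i_gateway, so a follow-up redirect for the same
 * destination can be matched against the exception entry even though
 * the parent route still points at the original gateway.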
2160 */ 2161 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { 2162 rt_cache = rt6_find_cached_rt(rt, 2163 &fl6->daddr, 2164 &fl6->saddr); 2165 if (rt_cache && 2166 ipv6_addr_equal(&rdfl->gateway, 2167 &rt_cache->rt6i_gateway)) { 2168 rt = rt_cache; 2169 break; 2170 } 2171 continue; 2172 } 2173 break; 2174 } 2175 2176 if (!rt) 2177 rt = net->ipv6.ip6_null_entry; 2178 else if (rt->dst.error) { 2179 rt = net->ipv6.ip6_null_entry; 2180 goto out; 2181 } 2182 2183 if (rt == net->ipv6.ip6_null_entry) { 2184 fn = fib6_backtrack(fn, &fl6->saddr); 2185 if (fn) 2186 goto restart; 2187 } 2188 2189 out: 2190 ip6_hold_safe(net, &rt, true); 2191 2192 rcu_read_unlock(); 2193 2194 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); 2195 return rt; 2196 }; 2197 2198 static struct dst_entry *ip6_route_redirect(struct net *net, 2199 const struct flowi6 *fl6, 2200 const struct in6_addr *gateway) 2201 { 2202 int flags = RT6_LOOKUP_F_HAS_SADDR; 2203 struct ip6rd_flowi rdfl; 2204 2205 rdfl.fl6 = *fl6; 2206 rdfl.gateway = *gateway; 2207 2208 return fib6_rule_lookup(net, &rdfl.fl6, 2209 flags, __ip6_route_redirect); 2210 } 2211 2212 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, 2213 kuid_t uid) 2214 { 2215 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; 2216 struct dst_entry *dst; 2217 struct flowi6 fl6; 2218 2219 memset(&fl6, 0, sizeof(fl6)); 2220 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2221 fl6.flowi6_oif = oif; 2222 fl6.flowi6_mark = mark; 2223 fl6.daddr = iph->daddr; 2224 fl6.saddr = iph->saddr; 2225 fl6.flowlabel = ip6_flowinfo(iph); 2226 fl6.flowi6_uid = uid; 2227 2228 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr); 2229 rt6_do_redirect(dst, NULL, skb); 2230 dst_release(dst); 2231 } 2232 EXPORT_SYMBOL_GPL(ip6_redirect); 2233 2234 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2235 u32 mark) 2236 { 2237 const struct ipv6hdr *iph = ipv6_hdr(skb); 2238 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2239 struct dst_entry *dst; 2240 struct flowi6 fl6; 2241 2242 memset(&fl6, 0, sizeof(fl6)); 2243 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2244 fl6.flowi6_oif = oif; 2245 fl6.flowi6_mark = mark; 2246 fl6.daddr = msg->dest; 2247 fl6.saddr = iph->daddr; 2248 fl6.flowi6_uid = sock_net_uid(net, NULL); 2249 2250 dst = ip6_route_redirect(net, &fl6, &iph->saddr); 2251 rt6_do_redirect(dst, NULL, skb); 2252 dst_release(dst); 2253 } 2254 2255 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2256 { 2257 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2258 sk->sk_uid); 2259 } 2260 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2261 2262 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2263 { 2264 struct net_device *dev = dst->dev; 2265 unsigned int mtu = dst_mtu(dst); 2266 struct net *net = dev_net(dev); 2267 2268 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2269 2270 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2271 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2272 2273 /* 2274 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2275 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
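 * (Worked example: a 1500 byte link MTU advertises an MSS of
 * 1500 - 40 - 20 = 1440 bytes, assuming ip6_rt_min_advmss is smaller.)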
2276 * IPV6_MAXPLEN is also valid and means: "any MSS, 2277 * rely only on pmtu discovery" 2278 */ 2279 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2280 mtu = IPV6_MAXPLEN; 2281 return mtu; 2282 } 2283 2284 static unsigned int ip6_mtu(const struct dst_entry *dst) 2285 { 2286 const struct rt6_info *rt = (const struct rt6_info *)dst; 2287 unsigned int mtu = rt->rt6i_pmtu; 2288 struct inet6_dev *idev; 2289 2290 if (mtu) 2291 goto out; 2292 2293 mtu = dst_metric_raw(dst, RTAX_MTU); 2294 if (mtu) 2295 goto out; 2296 2297 mtu = IPV6_MIN_MTU; 2298 2299 rcu_read_lock(); 2300 idev = __in6_dev_get(dst->dev); 2301 if (idev) 2302 mtu = idev->cnf.mtu6; 2303 rcu_read_unlock(); 2304 2305 out: 2306 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2307 2308 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2309 } 2310 2311 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2312 struct flowi6 *fl6) 2313 { 2314 struct dst_entry *dst; 2315 struct rt6_info *rt; 2316 struct inet6_dev *idev = in6_dev_get(dev); 2317 struct net *net = dev_net(dev); 2318 2319 if (unlikely(!idev)) 2320 return ERR_PTR(-ENODEV); 2321 2322 rt = ip6_dst_alloc(net, dev, 0); 2323 if (unlikely(!rt)) { 2324 in6_dev_put(idev); 2325 dst = ERR_PTR(-ENOMEM); 2326 goto out; 2327 } 2328 2329 rt->dst.flags |= DST_HOST; 2330 rt->dst.output = ip6_output; 2331 rt->rt6i_gateway = fl6->daddr; 2332 rt->rt6i_dst.addr = fl6->daddr; 2333 rt->rt6i_dst.plen = 128; 2334 rt->rt6i_idev = idev; 2335 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2336 2337 /* Add this dst into uncached_list so that rt6_ifdown() can 2338 * do proper release of the net_device 2339 */ 2340 rt6_uncached_list_add(rt); 2341 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); 2342 2343 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2344 2345 out: 2346 return dst; 2347 } 2348 2349 static int ip6_dst_gc(struct dst_ops *ops) 2350 { 2351 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2352 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2353 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2354 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; 2355 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2356 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2357 int entries; 2358 2359 entries = dst_entries_get_fast(ops); 2360 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2361 entries <= rt_max_size) 2362 goto out; 2363 2364 net->ipv6.ip6_rt_gc_expire++; 2365 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2366 entries = dst_entries_get_slow(ops); 2367 if (entries < ops->gc_thresh) 2368 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2369 out: 2370 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2371 return entries > rt_max_size; 2372 } 2373 2374 static int ip6_convert_metrics(struct mx6_config *mxc, 2375 const struct fib6_config *cfg) 2376 { 2377 bool ecn_ca = false; 2378 struct nlattr *nla; 2379 int remaining; 2380 u32 *mp; 2381 2382 if (!cfg->fc_mx) 2383 return 0; 2384 2385 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); 2386 if (unlikely(!mp)) 2387 return -ENOMEM; 2388 2389 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 2390 int type = nla_type(nla); 2391 u32 val; 2392 2393 if (!type) 2394 continue; 2395 if (unlikely(type > RTAX_MAX)) 2396 goto err; 2397 2398 if (type == RTAX_CC_ALGO) { 2399 char tmp[TCP_CA_NAME_MAX]; 2400 2401 nla_strlcpy(tmp, nla, sizeof(tmp)); 2402 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 2403 if (val == TCP_CA_UNSPEC) 2404 goto err; 2405 
} else { 2406 val = nla_get_u32(nla); 2407 } 2408 if (type == RTAX_HOPLIMIT && val > 255) 2409 val = 255; 2410 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) 2411 goto err; 2412 2413 mp[type - 1] = val; 2414 __set_bit(type - 1, mxc->mx_valid); 2415 } 2416 2417 if (ecn_ca) { 2418 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); 2419 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; 2420 } 2421 2422 mxc->mx = mp; 2423 return 0; 2424 err: 2425 kfree(mp); 2426 return -EINVAL; 2427 } 2428 2429 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2430 struct fib6_config *cfg, 2431 const struct in6_addr *gw_addr) 2432 { 2433 struct flowi6 fl6 = { 2434 .flowi6_oif = cfg->fc_ifindex, 2435 .daddr = *gw_addr, 2436 .saddr = cfg->fc_prefsrc, 2437 }; 2438 struct fib6_table *table; 2439 struct rt6_info *rt; 2440 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; 2441 2442 table = fib6_get_table(net, cfg->fc_table); 2443 if (!table) 2444 return NULL; 2445 2446 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2447 flags |= RT6_LOOKUP_F_HAS_SADDR; 2448 2449 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); 2450 2451 /* if table lookup failed, fall back to full lookup */ 2452 if (rt == net->ipv6.ip6_null_entry) { 2453 ip6_rt_put(rt); 2454 rt = NULL; 2455 } 2456 2457 return rt; 2458 } 2459 2460 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, 2461 struct netlink_ext_ack *extack) 2462 { 2463 struct net *net = cfg->fc_nlinfo.nl_net; 2464 struct rt6_info *rt = NULL; 2465 struct net_device *dev = NULL; 2466 struct inet6_dev *idev = NULL; 2467 struct fib6_table *table; 2468 int addr_type; 2469 int err = -EINVAL; 2470 2471 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2472 if (cfg->fc_flags & RTF_PCPU) { 2473 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2474 goto out; 2475 } 2476 2477 if (cfg->fc_dst_len > 128) { 2478 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2479 goto out; 2480 } 2481 if (cfg->fc_src_len > 128) { 2482 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2483 goto out; 2484 } 2485 #ifndef CONFIG_IPV6_SUBTREES 2486 if (cfg->fc_src_len) { 2487 NL_SET_ERR_MSG(extack, 2488 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2489 goto out; 2490 } 2491 #endif 2492 if (cfg->fc_ifindex) { 2493 err = -ENODEV; 2494 dev = dev_get_by_index(net, cfg->fc_ifindex); 2495 if (!dev) 2496 goto out; 2497 idev = in6_dev_get(dev); 2498 if (!idev) 2499 goto out; 2500 } 2501 2502 if (cfg->fc_metric == 0) 2503 cfg->fc_metric = IP6_RT_PRIO_USER; 2504 2505 err = -ENOBUFS; 2506 if (cfg->fc_nlinfo.nlh && 2507 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2508 table = fib6_get_table(net, cfg->fc_table); 2509 if (!table) { 2510 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2511 table = fib6_new_table(net, cfg->fc_table); 2512 } 2513 } else { 2514 table = fib6_new_table(net, cfg->fc_table); 2515 } 2516 2517 if (!table) 2518 goto out; 2519 2520 rt = ip6_dst_alloc(net, NULL, 2521 (cfg->fc_flags & RTF_ADDRCONF) ? 
0 : DST_NOCOUNT); 2522 2523 if (!rt) { 2524 err = -ENOMEM; 2525 goto out; 2526 } 2527 2528 if (cfg->fc_flags & RTF_EXPIRES) 2529 rt6_set_expires(rt, jiffies + 2530 clock_t_to_jiffies(cfg->fc_expires)); 2531 else 2532 rt6_clean_expires(rt); 2533 2534 if (cfg->fc_protocol == RTPROT_UNSPEC) 2535 cfg->fc_protocol = RTPROT_BOOT; 2536 rt->rt6i_protocol = cfg->fc_protocol; 2537 2538 addr_type = ipv6_addr_type(&cfg->fc_dst); 2539 2540 if (addr_type & IPV6_ADDR_MULTICAST) 2541 rt->dst.input = ip6_mc_input; 2542 else if (cfg->fc_flags & RTF_LOCAL) 2543 rt->dst.input = ip6_input; 2544 else 2545 rt->dst.input = ip6_forward; 2546 2547 rt->dst.output = ip6_output; 2548 2549 if (cfg->fc_encap) { 2550 struct lwtunnel_state *lwtstate; 2551 2552 err = lwtunnel_build_state(cfg->fc_encap_type, 2553 cfg->fc_encap, AF_INET6, cfg, 2554 &lwtstate, extack); 2555 if (err) 2556 goto out; 2557 rt->dst.lwtstate = lwtstate_get(lwtstate); 2558 if (lwtunnel_output_redirect(rt->dst.lwtstate)) { 2559 rt->dst.lwtstate->orig_output = rt->dst.output; 2560 rt->dst.output = lwtunnel_output; 2561 } 2562 if (lwtunnel_input_redirect(rt->dst.lwtstate)) { 2563 rt->dst.lwtstate->orig_input = rt->dst.input; 2564 rt->dst.input = lwtunnel_input; 2565 } 2566 } 2567 2568 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2569 rt->rt6i_dst.plen = cfg->fc_dst_len; 2570 if (rt->rt6i_dst.plen == 128) 2571 rt->dst.flags |= DST_HOST; 2572 2573 #ifdef CONFIG_IPV6_SUBTREES 2574 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 2575 rt->rt6i_src.plen = cfg->fc_src_len; 2576 #endif 2577 2578 rt->rt6i_metric = cfg->fc_metric; 2579 2580 /* We cannot add true routes via loopback here, 2581 they would result in kernel looping; promote them to reject routes 2582 */ 2583 if ((cfg->fc_flags & RTF_REJECT) || 2584 (dev && (dev->flags & IFF_LOOPBACK) && 2585 !(addr_type & IPV6_ADDR_LOOPBACK) && 2586 !(cfg->fc_flags & RTF_LOCAL))) { 2587 /* hold loopback dev/idev if we haven't done so. */ 2588 if (dev != net->loopback_dev) { 2589 if (dev) { 2590 dev_put(dev); 2591 in6_dev_put(idev); 2592 } 2593 dev = net->loopback_dev; 2594 dev_hold(dev); 2595 idev = in6_dev_get(dev); 2596 if (!idev) { 2597 err = -ENODEV; 2598 goto out; 2599 } 2600 } 2601 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 2602 switch (cfg->fc_type) { 2603 case RTN_BLACKHOLE: 2604 rt->dst.error = -EINVAL; 2605 rt->dst.output = dst_discard_out; 2606 rt->dst.input = dst_discard; 2607 break; 2608 case RTN_PROHIBIT: 2609 rt->dst.error = -EACCES; 2610 rt->dst.output = ip6_pkt_prohibit_out; 2611 rt->dst.input = ip6_pkt_prohibit; 2612 break; 2613 case RTN_THROW: 2614 case RTN_UNREACHABLE: 2615 default: 2616 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN 2617 : (cfg->fc_type == RTN_UNREACHABLE) 2618 ? -EHOSTUNREACH : -ENETUNREACH; 2619 rt->dst.output = ip6_pkt_discard_out; 2620 rt->dst.input = ip6_pkt_discard; 2621 break; 2622 } 2623 goto install_route; 2624 } 2625 2626 if (cfg->fc_flags & RTF_GATEWAY) { 2627 const struct in6_addr *gw_addr; 2628 int gwa_type; 2629 2630 gw_addr = &cfg->fc_gateway; 2631 gwa_type = ipv6_addr_type(gw_addr); 2632 2633 /* if gw_addr is local we will fail to detect this in case 2634 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2635 * will return already-added prefix route via interface that 2636 * prefix route was assigned to, which might be non-loopback. 2637 */ 2638 err = -EINVAL; 2639 if (ipv6_chk_addr_and_flags(net, gw_addr, 2640 gwa_type & IPV6_ADDR_LINKLOCAL ? 
2641 dev : NULL, 0, 0)) { 2642 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2643 goto out; 2644 } 2645 rt->rt6i_gateway = *gw_addr; 2646 2647 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { 2648 struct rt6_info *grt = NULL; 2649 2650 /* IPv6 strictly inhibits using non-link-local 2651 addresses as nexthop addresses. 2652 Otherwise, the router will not be able to send redirects. 2653 That is as it should be, but in some (rare!) circumstances 2654 (SIT, PtP, NBMA NOARP links) it is handy to allow 2655 some exceptions. --ANK 2656 We allow IPv4-mapped nexthops to support RFC4798-type 2657 addressing. 2658 */ 2659 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2660 IPV6_ADDR_MAPPED))) { 2661 NL_SET_ERR_MSG(extack, 2662 "Invalid gateway address"); 2663 goto out; 2664 } 2665 2666 if (cfg->fc_table) { 2667 grt = ip6_nh_lookup_table(net, cfg, gw_addr); 2668 2669 if (grt) { 2670 if (grt->rt6i_flags & RTF_GATEWAY || 2671 (dev && dev != grt->dst.dev)) { 2672 ip6_rt_put(grt); 2673 grt = NULL; 2674 } 2675 } 2676 } 2677 2678 if (!grt) 2679 grt = rt6_lookup(net, gw_addr, NULL, 2680 cfg->fc_ifindex, 1); 2681 2682 err = -EHOSTUNREACH; 2683 if (!grt) 2684 goto out; 2685 if (dev) { 2686 if (dev != grt->dst.dev) { 2687 ip6_rt_put(grt); 2688 goto out; 2689 } 2690 } else { 2691 dev = grt->dst.dev; 2692 idev = grt->rt6i_idev; 2693 dev_hold(dev); 2694 in6_dev_hold(grt->rt6i_idev); 2695 } 2696 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2697 err = 0; 2698 ip6_rt_put(grt); 2699 2700 if (err) 2701 goto out; 2702 } 2703 err = -EINVAL; 2704 if (!dev) { 2705 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2706 goto out; 2707 } else if (dev->flags & IFF_LOOPBACK) { 2708 NL_SET_ERR_MSG(extack, 2709 "Egress device can not be loopback device for this route"); 2710 goto out; 2711 } 2712 } 2713 2714 err = -ENODEV; 2715 if (!dev) 2716 goto out; 2717 2718 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2719 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2720 NL_SET_ERR_MSG(extack, "Invalid source address"); 2721 err = -EINVAL; 2722 goto out; 2723 } 2724 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2725 rt->rt6i_prefsrc.plen = 128; 2726 } else 2727 rt->rt6i_prefsrc.plen = 0; 2728 2729 rt->rt6i_flags = cfg->fc_flags; 2730 2731 install_route: 2732 rt->dst.dev = dev; 2733 rt->rt6i_idev = idev; 2734 rt->rt6i_table = table; 2735 2736 cfg->fc_nlinfo.nl_net = dev_net(dev); 2737 2738 return rt; 2739 out: 2740 if (dev) 2741 dev_put(dev); 2742 if (idev) 2743 in6_dev_put(idev); 2744 if (rt) 2745 dst_release_immediate(&rt->dst); 2746 2747 return ERR_PTR(err); 2748 } 2749 2750 int ip6_route_add(struct fib6_config *cfg, 2751 struct netlink_ext_ack *extack) 2752 { 2753 struct mx6_config mxc = { .mx = NULL, }; 2754 struct rt6_info *rt; 2755 int err; 2756 2757 rt = ip6_route_info_create(cfg, extack); 2758 if (IS_ERR(rt)) { 2759 err = PTR_ERR(rt); 2760 rt = NULL; 2761 goto out; 2762 } 2763 2764 err = ip6_convert_metrics(&mxc, cfg); 2765 if (err) 2766 goto out; 2767 2768 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2769 2770 kfree(mxc.mx); 2771 2772 return err; 2773 out: 2774 if (rt) 2775 dst_release_immediate(&rt->dst); 2776 2777 return err; 2778 } 2779 2780 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2781 { 2782 int err; 2783 struct fib6_table *table; 2784 struct net *net = dev_net(rt->dst.dev); 2785 2786 if (rt == net->ipv6.ip6_null_entry) { 2787 err = -ENOENT; 2788 goto out; 2789 } 2790 2791 table = rt->rt6i_table; 2792 spin_lock_bh(&table->tb6_lock); 2793 err = fib6_del(rt, info); 2794 spin_unlock_bh(&table->tb6_lock); 2795
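/* Success or failure, drop the reference the caller transferred
 * to us along with rt.
 */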
2796 out: 2797 ip6_rt_put(rt); 2798 return err; 2799 } 2800 2801 int ip6_del_rt(struct rt6_info *rt) 2802 { 2803 struct nl_info info = { 2804 .nl_net = dev_net(rt->dst.dev), 2805 }; 2806 return __ip6_del_rt(rt, &info); 2807 } 2808 2809 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2810 { 2811 struct nl_info *info = &cfg->fc_nlinfo; 2812 struct net *net = info->nl_net; 2813 struct sk_buff *skb = NULL; 2814 struct fib6_table *table; 2815 int err = -ENOENT; 2816 2817 if (rt == net->ipv6.ip6_null_entry) 2818 goto out_put; 2819 table = rt->rt6i_table; 2820 spin_lock_bh(&table->tb6_lock); 2821 2822 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2823 struct rt6_info *sibling, *next_sibling; 2824 2825 /* prefer to send a single notification with all hops */ 2826 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2827 if (skb) { 2828 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2829 2830 if (rt6_fill_node(net, skb, rt, 2831 NULL, NULL, 0, RTM_DELROUTE, 2832 info->portid, seq, 0) < 0) { 2833 kfree_skb(skb); 2834 skb = NULL; 2835 } else 2836 info->skip_notify = 1; 2837 } 2838 2839 list_for_each_entry_safe(sibling, next_sibling, 2840 &rt->rt6i_siblings, 2841 rt6i_siblings) { 2842 err = fib6_del(sibling, info); 2843 if (err) 2844 goto out_unlock; 2845 } 2846 } 2847 2848 err = fib6_del(rt, info); 2849 out_unlock: 2850 spin_unlock_bh(&table->tb6_lock); 2851 out_put: 2852 ip6_rt_put(rt); 2853 2854 if (skb) { 2855 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2856 info->nlh, gfp_any()); 2857 } 2858 return err; 2859 } 2860 2861 static int ip6_route_del(struct fib6_config *cfg, 2862 struct netlink_ext_ack *extack) 2863 { 2864 struct rt6_info *rt, *rt_cache; 2865 struct fib6_table *table; 2866 struct fib6_node *fn; 2867 int err = -ESRCH; 2868 2869 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2870 if (!table) { 2871 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2872 return err; 2873 } 2874 2875 rcu_read_lock(); 2876 2877 fn = fib6_locate(&table->tb6_root, 2878 &cfg->fc_dst, cfg->fc_dst_len, 2879 &cfg->fc_src, cfg->fc_src_len, 2880 !(cfg->fc_flags & RTF_CACHE)); 2881 2882 if (fn) { 2883 for_each_fib6_node_rt_rcu(fn) { 2884 if (cfg->fc_flags & RTF_CACHE) { 2885 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, 2886 &cfg->fc_src); 2887 if (!rt_cache) 2888 continue; 2889 rt = rt_cache; 2890 } 2891 if (cfg->fc_ifindex && 2892 (!rt->dst.dev || 2893 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2894 continue; 2895 if (cfg->fc_flags & RTF_GATEWAY && 2896 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2897 continue; 2898 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2899 continue; 2900 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2901 continue; 2902 if (!dst_hold_safe(&rt->dst)) 2903 break; 2904 rcu_read_unlock(); 2905 2906 /* if gateway was specified only delete the one hop */ 2907 if (cfg->fc_flags & RTF_GATEWAY) 2908 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2909 2910 return __ip6_del_rt_siblings(rt, cfg); 2911 } 2912 } 2913 rcu_read_unlock(); 2914 2915 return err; 2916 } 2917 2918 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 2919 { 2920 struct netevent_redirect netevent; 2921 struct rt6_info *rt, *nrt = NULL; 2922 struct ndisc_options ndopts; 2923 struct inet6_dev *in6_dev; 2924 struct neighbour *neigh; 2925 struct rd_msg *msg; 2926 int optlen, on_link; 2927 u8 *lladdr; 2928 2929 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 2930 optlen -= sizeof(*msg); 2931 
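/* optlen now covers only the ND options that follow the fixed
 * rd_msg header; a negative value means the redirect was truncated.
 */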
2932 if (optlen < 0) { 2933 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 2934 return; 2935 } 2936 2937 msg = (struct rd_msg *)icmp6_hdr(skb); 2938 2939 if (ipv6_addr_is_multicast(&msg->dest)) { 2940 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 2941 return; 2942 } 2943 2944 on_link = 0; 2945 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 2946 on_link = 1; 2947 } else if (ipv6_addr_type(&msg->target) != 2948 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 2949 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 2950 return; 2951 } 2952 2953 in6_dev = __in6_dev_get(skb->dev); 2954 if (!in6_dev) 2955 return; 2956 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 2957 return; 2958 2959 /* RFC2461 8.1: 2960 * The IP source address of the Redirect MUST be the same as the current 2961 * first-hop router for the specified ICMP Destination Address. 2962 */ 2963 2964 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 2965 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 2966 return; 2967 } 2968 2969 lladdr = NULL; 2970 if (ndopts.nd_opts_tgt_lladdr) { 2971 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 2972 skb->dev); 2973 if (!lladdr) { 2974 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 2975 return; 2976 } 2977 } 2978 2979 rt = (struct rt6_info *) dst; 2980 if (rt->rt6i_flags & RTF_REJECT) { 2981 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 2982 return; 2983 } 2984 2985 /* Redirect received -> path was valid. 2986 * Look, redirects are sent only in response to data packets, 2987 * so that this nexthop apparently is reachable. --ANK 2988 */ 2989 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 2990 2991 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2992 if (!neigh) 2993 return; 2994 2995 /* 2996 * We have finally decided to accept it. 2997 */ 2998 2999 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 3000 NEIGH_UPDATE_F_WEAK_OVERRIDE| 3001 NEIGH_UPDATE_F_OVERRIDE| 3002 (on_link ? 
0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 3003 NEIGH_UPDATE_F_ISROUTER)), 3004 NDISC_REDIRECT, &ndopts); 3005 3006 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 3007 if (!nrt) 3008 goto out; 3009 3010 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 3011 if (on_link) 3012 nrt->rt6i_flags &= ~RTF_GATEWAY; 3013 3014 nrt->rt6i_protocol = RTPROT_REDIRECT; 3015 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 3016 3017 /* No need to remove rt from the exception table if rt is 3018 * a cached route because rt6_insert_exception() will 3019 * take care of it. 3020 */ 3021 if (rt6_insert_exception(nrt, rt)) { 3022 dst_release_immediate(&nrt->dst); 3023 goto out; 3024 } 3025 3026 netevent.old = &rt->dst; 3027 netevent.new = &nrt->dst; 3028 netevent.daddr = &msg->dest; 3029 netevent.neigh = neigh; 3030 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 3031 3032 out: 3033 neigh_release(neigh); 3034 } 3035 3036 /* 3037 * Misc support functions 3038 */ 3039 3040 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 3041 { 3042 BUG_ON(from->dst.from); 3043 3044 rt->rt6i_flags &= ~RTF_EXPIRES; 3045 dst_hold(&from->dst); 3046 rt->dst.from = &from->dst; 3047 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 3048 } 3049 3050 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 3051 { 3052 rt->dst.input = ort->dst.input; 3053 rt->dst.output = ort->dst.output; 3054 rt->rt6i_dst = ort->rt6i_dst; 3055 rt->dst.error = ort->dst.error; 3056 rt->rt6i_idev = ort->rt6i_idev; 3057 if (rt->rt6i_idev) 3058 in6_dev_hold(rt->rt6i_idev); 3059 rt->dst.lastuse = jiffies; 3060 rt->rt6i_gateway = ort->rt6i_gateway; 3061 rt->rt6i_flags = ort->rt6i_flags; 3062 rt6_set_from(rt, ort); 3063 rt->rt6i_metric = ort->rt6i_metric; 3064 #ifdef CONFIG_IPV6_SUBTREES 3065 rt->rt6i_src = ort->rt6i_src; 3066 #endif 3067 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 3068 rt->rt6i_table = ort->rt6i_table; 3069 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); 3070 } 3071 3072 #ifdef CONFIG_IPV6_ROUTE_INFO 3073 static struct rt6_info *rt6_get_route_info(struct net *net, 3074 const struct in6_addr *prefix, int prefixlen, 3075 const struct in6_addr *gwaddr, 3076 struct net_device *dev) 3077 { 3078 u32 tb_id = l3mdev_fib_table(dev) ?
: RT6_TABLE_INFO; 3079 int ifindex = dev->ifindex; 3080 struct fib6_node *fn; 3081 struct rt6_info *rt = NULL; 3082 struct fib6_table *table; 3083 3084 table = fib6_get_table(net, tb_id); 3085 if (!table) 3086 return NULL; 3087 3088 rcu_read_lock(); 3089 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3090 if (!fn) 3091 goto out; 3092 3093 for_each_fib6_node_rt_rcu(fn) { 3094 if (rt->dst.dev->ifindex != ifindex) 3095 continue; 3096 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3097 continue; 3098 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 3099 continue; 3100 ip6_hold_safe(NULL, &rt, false); 3101 break; 3102 } 3103 out: 3104 rcu_read_unlock(); 3105 return rt; 3106 } 3107 3108 static struct rt6_info *rt6_add_route_info(struct net *net, 3109 const struct in6_addr *prefix, int prefixlen, 3110 const struct in6_addr *gwaddr, 3111 struct net_device *dev, 3112 unsigned int pref) 3113 { 3114 struct fib6_config cfg = { 3115 .fc_metric = IP6_RT_PRIO_USER, 3116 .fc_ifindex = dev->ifindex, 3117 .fc_dst_len = prefixlen, 3118 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3119 RTF_UP | RTF_PREF(pref), 3120 .fc_protocol = RTPROT_RA, 3121 .fc_nlinfo.portid = 0, 3122 .fc_nlinfo.nlh = NULL, 3123 .fc_nlinfo.nl_net = net, 3124 }; 3125 3126 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3127 cfg.fc_dst = *prefix; 3128 cfg.fc_gateway = *gwaddr; 3129 3130 /* We should treat it as a default route if prefix length is 0. */ 3131 if (!prefixlen) 3132 cfg.fc_flags |= RTF_DEFAULT; 3133 3134 ip6_route_add(&cfg, NULL); 3135 3136 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3137 } 3138 #endif 3139 3140 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 3141 { 3142 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3143 struct rt6_info *rt; 3144 struct fib6_table *table; 3145 3146 table = fib6_get_table(dev_net(dev), tb_id); 3147 if (!table) 3148 return NULL; 3149 3150 rcu_read_lock(); 3151 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3152 if (dev == rt->dst.dev && 3153 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3154 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 3155 break; 3156 } 3157 if (rt) 3158 ip6_hold_safe(NULL, &rt, false); 3159 rcu_read_unlock(); 3160 return rt; 3161 } 3162 3163 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 3164 struct net_device *dev, 3165 unsigned int pref) 3166 { 3167 struct fib6_config cfg = { 3168 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 3169 .fc_metric = IP6_RT_PRIO_USER, 3170 .fc_ifindex = dev->ifindex, 3171 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3172 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3173 .fc_protocol = RTPROT_RA, 3174 .fc_nlinfo.portid = 0, 3175 .fc_nlinfo.nlh = NULL, 3176 .fc_nlinfo.nl_net = dev_net(dev), 3177 }; 3178 3179 cfg.fc_gateway = *gwaddr; 3180 3181 if (!ip6_route_add(&cfg, NULL)) { 3182 struct fib6_table *table; 3183 3184 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3185 if (table) 3186 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3187 } 3188 3189 return rt6_get_dflt_router(gwaddr, dev); 3190 } 3191 3192 static void __rt6_purge_dflt_routers(struct fib6_table *table) 3193 { 3194 struct rt6_info *rt; 3195 3196 restart: 3197 rcu_read_lock(); 3198 for_each_fib6_node_rt_rcu(&table->tb6_root) { 3199 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3200 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 3201 if (dst_hold_safe(&rt->dst)) { 3202 rcu_read_unlock(); 3203 ip6_del_rt(rt); 3204 } else { 3205 rcu_read_unlock(); 3206 } 3207 goto restart; 3208 } 3209 } 3210 rcu_read_unlock(); 3211 3212 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3213 } 3214 3215 void rt6_purge_dflt_routers(struct net *net) 3216 { 3217 struct fib6_table *table; 3218 struct hlist_head *head; 3219 unsigned int h; 3220 3221 rcu_read_lock(); 3222 3223 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3224 head = &net->ipv6.fib_table_hash[h]; 3225 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3226 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3227 __rt6_purge_dflt_routers(table); 3228 } 3229 } 3230 3231 rcu_read_unlock(); 3232 } 3233 3234 static void rtmsg_to_fib6_config(struct net *net, 3235 struct in6_rtmsg *rtmsg, 3236 struct fib6_config *cfg) 3237 { 3238 memset(cfg, 0, sizeof(*cfg)); 3239 3240 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3241 : RT6_TABLE_MAIN; 3242 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3243 cfg->fc_metric = rtmsg->rtmsg_metric; 3244 cfg->fc_expires = rtmsg->rtmsg_info; 3245 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3246 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3247 cfg->fc_flags = rtmsg->rtmsg_flags; 3248 3249 cfg->fc_nlinfo.nl_net = net; 3250 3251 cfg->fc_dst = rtmsg->rtmsg_dst; 3252 cfg->fc_src = rtmsg->rtmsg_src; 3253 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3254 } 3255 3256 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3257 { 3258 struct fib6_config cfg; 3259 struct in6_rtmsg rtmsg; 3260 int err; 3261 3262 switch (cmd) { 3263 case SIOCADDRT: /* Add a route */ 3264 case SIOCDELRT: /* Delete a route */ 3265 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3266 return -EPERM; 3267 err = copy_from_user(&rtmsg, arg, 3268 sizeof(struct in6_rtmsg)); 3269 if (err) 3270 return -EFAULT; 3271 3272 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3273 3274 rtnl_lock(); 3275 switch (cmd) { 3276 case SIOCADDRT: 3277 err = ip6_route_add(&cfg, NULL); 3278 break; 3279 case SIOCDELRT: 3280 err = ip6_route_del(&cfg, NULL); 3281 break; 3282 default: 3283 err = -EINVAL; 3284 } 3285 rtnl_unlock(); 3286 3287 return err; 3288 } 3289 3290 return -EINVAL; 3291 } 3292 3293 /* 3294 * Drop the packet on the floor 3295 */ 3296 3297 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3298 { 3299 int type; 3300 struct dst_entry *dst = skb_dst(skb); 3301 switch (ipstats_mib_noroutes) { 3302 case IPSTATS_MIB_INNOROUTES: 3303 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3304 if (type == IPV6_ADDR_ANY) { 3305 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3306 IPSTATS_MIB_INADDRERRORS); 3307 break; 3308 } 3309 /* FALLTHROUGH */ 3310 case IPSTATS_MIB_OUTNOROUTES: 3311 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3312 ipstats_mib_noroutes); 3313 break; 3314 } 3315 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3316 kfree_skb(skb); 3317 return 0; 3318 } 3319 3320 static int ip6_pkt_discard(struct sk_buff *skb) 3321 { 3322 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3323 } 3324 3325 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3326 { 3327 skb->dev = skb_dst(skb)->dev; 3328 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3329 } 3330 3331 static int ip6_pkt_prohibit(struct sk_buff *skb) 3332 { 3333 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3334 } 3335 3336 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3337 { 3338 skb->dev = skb_dst(skb)->dev; 3339 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3340 } 3341 3342 /* 3343 * Allocate a dst for local (unicast / anycast) address. 
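 * The result is a /128 host route carrying RTF_UP | RTF_NONEXTHOP plus
 * either RTF_ANYCAST or RTF_LOCAL, placed in the local table (or the
 * l3mdev table when the device is enslaved).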
3344 */ 3345 3346 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 3347 const struct in6_addr *addr, 3348 bool anycast) 3349 { 3350 u32 tb_id; 3351 struct net *net = dev_net(idev->dev); 3352 struct net_device *dev = idev->dev; 3353 struct rt6_info *rt; 3354 3355 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 3356 if (!rt) 3357 return ERR_PTR(-ENOMEM); 3358 3359 in6_dev_hold(idev); 3360 3361 rt->dst.flags |= DST_HOST; 3362 rt->dst.input = ip6_input; 3363 rt->dst.output = ip6_output; 3364 rt->rt6i_idev = idev; 3365 3366 rt->rt6i_protocol = RTPROT_KERNEL; 3367 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 3368 if (anycast) 3369 rt->rt6i_flags |= RTF_ANYCAST; 3370 else 3371 rt->rt6i_flags |= RTF_LOCAL; 3372 3373 rt->rt6i_gateway = *addr; 3374 rt->rt6i_dst.addr = *addr; 3375 rt->rt6i_dst.plen = 128; 3376 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3377 rt->rt6i_table = fib6_get_table(net, tb_id); 3378 3379 return rt; 3380 } 3381 3382 /* remove a deleted IP from prefsrc entries */ 3383 struct arg_dev_net_ip { 3384 struct net_device *dev; 3385 struct net *net; 3386 struct in6_addr *addr; 3387 }; 3388 3389 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 3390 { 3391 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3392 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3393 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3394 3395 if (((void *)rt->dst.dev == dev || !dev) && 3396 rt != net->ipv6.ip6_null_entry && 3397 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 3398 spin_lock_bh(&rt6_exception_lock); 3399 /* remove prefsrc entry */ 3400 rt->rt6i_prefsrc.plen = 0; 3401 /* need to update cache as well */ 3402 rt6_exceptions_remove_prefsrc(rt); 3403 spin_unlock_bh(&rt6_exception_lock); 3404 } 3405 return 0; 3406 } 3407 3408 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3409 { 3410 struct net *net = dev_net(ifp->idev->dev); 3411 struct arg_dev_net_ip adni = { 3412 .dev = ifp->idev->dev, 3413 .net = net, 3414 .addr = &ifp->addr, 3415 }; 3416 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3417 } 3418 3419 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3420 3421 /* Remove routers and update dst entries when a gateway turns into a host. */ 3422 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 3423 { 3424 struct in6_addr *gateway = (struct in6_addr *)arg; 3425 3426 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && 3427 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 3428 return -1; 3429 } 3430 3431 /* Further clean up cached routes in the exception table. 3432 * This is needed because a cached route may have a different 3433 * gateway than its 'parent' in the case of an ip redirect.
3434 */ 3435 rt6_exceptions_clean_tohost(rt, gateway); 3436 3437 return 0; 3438 } 3439 3440 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3441 { 3442 fib6_clean_all(net, fib6_clean_tohost, gateway); 3443 } 3444 3445 struct arg_dev_net { 3446 struct net_device *dev; 3447 struct net *net; 3448 }; 3449 3450 /* called with write lock held for table with rt */ 3451 static int fib6_ifdown(struct rt6_info *rt, void *arg) 3452 { 3453 const struct arg_dev_net *adn = arg; 3454 const struct net_device *dev = adn->dev; 3455 3456 if ((rt->dst.dev == dev || !dev) && 3457 rt != adn->net->ipv6.ip6_null_entry && 3458 (rt->rt6i_nsiblings == 0 || 3459 (dev && netdev_unregistering(dev)) || 3460 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 3461 return -1; 3462 3463 return 0; 3464 } 3465 3466 void rt6_ifdown(struct net *net, struct net_device *dev) 3467 { 3468 struct arg_dev_net adn = { 3469 .dev = dev, 3470 .net = net, 3471 }; 3472 3473 fib6_clean_all(net, fib6_ifdown, &adn); 3474 if (dev) 3475 rt6_uncached_list_flush_dev(net, dev); 3476 } 3477 3478 struct rt6_mtu_change_arg { 3479 struct net_device *dev; 3480 unsigned int mtu; 3481 }; 3482 3483 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 3484 { 3485 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 3486 struct inet6_dev *idev; 3487 3488 /* In IPv6, PMTU discovery is not optional, 3489 so the RTAX_MTU lock cannot disable it. 3490 We still use this lock to block changes 3491 caused by addrconf/ndisc. 3492 */ 3493 3494 idev = __in6_dev_get(arg->dev); 3495 if (!idev) 3496 return 0; 3497 3498 /* After an administrative MTU increase there is no way to discover 3499 the larger IPv6 PMTU, so the increase must be applied here. 3500 RFC 1981 doesn't cover administrative MTU increases 3501 (e.g. enabling jumbo frames), so updating the PMTU is a MUST. 3502 */ 3503 /* 3504 If the new MTU is less than the route PMTU, the new MTU will be 3505 the lowest MTU in the path; update the route PMTU to reflect the 3506 decrease. If the new MTU is greater than the route PMTU, and the 3507 old MTU is the lowest MTU in the path, update the route PMTU 3508 to reflect the increase. In this case, if another node along the 3509 path still has the lowest MTU, a Packet Too Big message will 3510 trigger PMTU discovery again.
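 * (Example: dropping a device MTU from 1500 to 1400 clamps every
 * route on the device whose unlocked MTU metric was 1400 or more
 * down to 1400; raising it again only touches routes whose metric
 * equals idev->cnf.mtu6.)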
3511 */ 3512 if (rt->dst.dev == arg->dev && 3513 dst_metric_raw(&rt->dst, RTAX_MTU) && 3514 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 3515 spin_lock_bh(&rt6_exception_lock); 3516 if (dst_mtu(&rt->dst) >= arg->mtu || 3517 (dst_mtu(&rt->dst) < arg->mtu && 3518 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 3519 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 3520 } 3521 rt6_exceptions_update_pmtu(rt, arg->mtu); 3522 spin_unlock_bh(&rt6_exception_lock); 3523 } 3524 return 0; 3525 } 3526 3527 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 3528 { 3529 struct rt6_mtu_change_arg arg = { 3530 .dev = dev, 3531 .mtu = mtu, 3532 }; 3533 3534 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 3535 } 3536 3537 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 3538 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 3539 [RTA_OIF] = { .type = NLA_U32 }, 3540 [RTA_IIF] = { .type = NLA_U32 }, 3541 [RTA_PRIORITY] = { .type = NLA_U32 }, 3542 [RTA_METRICS] = { .type = NLA_NESTED }, 3543 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 3544 [RTA_PREF] = { .type = NLA_U8 }, 3545 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 3546 [RTA_ENCAP] = { .type = NLA_NESTED }, 3547 [RTA_EXPIRES] = { .type = NLA_U32 }, 3548 [RTA_UID] = { .type = NLA_U32 }, 3549 [RTA_MARK] = { .type = NLA_U32 }, 3550 }; 3551 3552 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 3553 struct fib6_config *cfg, 3554 struct netlink_ext_ack *extack) 3555 { 3556 struct rtmsg *rtm; 3557 struct nlattr *tb[RTA_MAX+1]; 3558 unsigned int pref; 3559 int err; 3560 3561 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3562 NULL); 3563 if (err < 0) 3564 goto errout; 3565 3566 err = -EINVAL; 3567 rtm = nlmsg_data(nlh); 3568 memset(cfg, 0, sizeof(*cfg)); 3569 3570 cfg->fc_table = rtm->rtm_table; 3571 cfg->fc_dst_len = rtm->rtm_dst_len; 3572 cfg->fc_src_len = rtm->rtm_src_len; 3573 cfg->fc_flags = RTF_UP; 3574 cfg->fc_protocol = rtm->rtm_protocol; 3575 cfg->fc_type = rtm->rtm_type; 3576 3577 if (rtm->rtm_type == RTN_UNREACHABLE || 3578 rtm->rtm_type == RTN_BLACKHOLE || 3579 rtm->rtm_type == RTN_PROHIBIT || 3580 rtm->rtm_type == RTN_THROW) 3581 cfg->fc_flags |= RTF_REJECT; 3582 3583 if (rtm->rtm_type == RTN_LOCAL) 3584 cfg->fc_flags |= RTF_LOCAL; 3585 3586 if (rtm->rtm_flags & RTM_F_CLONED) 3587 cfg->fc_flags |= RTF_CACHE; 3588 3589 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 3590 cfg->fc_nlinfo.nlh = nlh; 3591 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3592 3593 if (tb[RTA_GATEWAY]) { 3594 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3595 cfg->fc_flags |= RTF_GATEWAY; 3596 } 3597 3598 if (tb[RTA_DST]) { 3599 int plen = (rtm->rtm_dst_len + 7) >> 3; 3600 3601 if (nla_len(tb[RTA_DST]) < plen) 3602 goto errout; 3603 3604 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3605 } 3606 3607 if (tb[RTA_SRC]) { 3608 int plen = (rtm->rtm_src_len + 7) >> 3; 3609 3610 if (nla_len(tb[RTA_SRC]) < plen) 3611 goto errout; 3612 3613 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3614 } 3615 3616 if (tb[RTA_PREFSRC]) 3617 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3618 3619 if (tb[RTA_OIF]) 3620 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3621 3622 if (tb[RTA_PRIORITY]) 3623 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3624 3625 if (tb[RTA_METRICS]) { 3626 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3627 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3628 } 3629 3630 if (tb[RTA_TABLE]) 3631 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3632 3633 if (tb[RTA_MULTIPATH]) { 3634 cfg->fc_mp = 
nla_data(tb[RTA_MULTIPATH]); 3635 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3636 3637 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3638 cfg->fc_mp_len, extack); 3639 if (err < 0) 3640 goto errout; 3641 } 3642 3643 if (tb[RTA_PREF]) { 3644 pref = nla_get_u8(tb[RTA_PREF]); 3645 if (pref != ICMPV6_ROUTER_PREF_LOW && 3646 pref != ICMPV6_ROUTER_PREF_HIGH) 3647 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3648 cfg->fc_flags |= RTF_PREF(pref); 3649 } 3650 3651 if (tb[RTA_ENCAP]) 3652 cfg->fc_encap = tb[RTA_ENCAP]; 3653 3654 if (tb[RTA_ENCAP_TYPE]) { 3655 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3656 3657 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3658 if (err < 0) 3659 goto errout; 3660 } 3661 3662 if (tb[RTA_EXPIRES]) { 3663 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3664 3665 if (addrconf_finite_timeout(timeout)) { 3666 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3667 cfg->fc_flags |= RTF_EXPIRES; 3668 } 3669 } 3670 3671 err = 0; 3672 errout: 3673 return err; 3674 } 3675 3676 struct rt6_nh { 3677 struct rt6_info *rt6_info; 3678 struct fib6_config r_cfg; 3679 struct mx6_config mxc; 3680 struct list_head next; 3681 }; 3682 3683 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3684 { 3685 struct rt6_nh *nh; 3686 3687 list_for_each_entry(nh, rt6_nh_list, next) { 3688 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3689 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3690 nh->r_cfg.fc_ifindex); 3691 } 3692 } 3693 3694 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3695 struct rt6_info *rt, struct fib6_config *r_cfg) 3696 { 3697 struct rt6_nh *nh; 3698 int err = -EEXIST; 3699 3700 list_for_each_entry(nh, rt6_nh_list, next) { 3701 /* check if rt6_info already exists */ 3702 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 3703 return err; 3704 } 3705 3706 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 3707 if (!nh) 3708 return -ENOMEM; 3709 nh->rt6_info = rt; 3710 err = ip6_convert_metrics(&nh->mxc, r_cfg); 3711 if (err) { 3712 kfree(nh); 3713 return err; 3714 } 3715 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 3716 list_add_tail(&nh->next, rt6_nh_list); 3717 3718 return 0; 3719 } 3720 3721 static void ip6_route_mpath_notify(struct rt6_info *rt, 3722 struct rt6_info *rt_last, 3723 struct nl_info *info, 3724 __u16 nlflags) 3725 { 3726 /* if this is an APPEND route, then rt points to the first route 3727 * inserted and rt_last points to last route inserted. Userspace 3728 * wants a consistent dump of the route which starts at the first 3729 * nexthop. 
Since sibling routes are always added at the end of 3730 * the list, find the first sibling of the last route appended 3731 */ 3732 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 3733 rt = list_first_entry(&rt_last->rt6i_siblings, 3734 struct rt6_info, 3735 rt6i_siblings); 3736 } 3737 3738 if (rt) 3739 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 3740 } 3741 3742 static int ip6_route_multipath_add(struct fib6_config *cfg, 3743 struct netlink_ext_ack *extack) 3744 { 3745 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 3746 struct nl_info *info = &cfg->fc_nlinfo; 3747 struct fib6_config r_cfg; 3748 struct rtnexthop *rtnh; 3749 struct rt6_info *rt; 3750 struct rt6_nh *err_nh; 3751 struct rt6_nh *nh, *nh_safe; 3752 __u16 nlflags; 3753 int remaining; 3754 int attrlen; 3755 int err = 1; 3756 int nhn = 0; 3757 int replace = (cfg->fc_nlinfo.nlh && 3758 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3759 LIST_HEAD(rt6_nh_list); 3760 3761 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; 3762 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 3763 nlflags |= NLM_F_APPEND; 3764 3765 remaining = cfg->fc_mp_len; 3766 rtnh = (struct rtnexthop *)cfg->fc_mp; 3767 3768 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 3769 * rt6_info structs, one per nexthop 3770 */ 3771 while (rtnh_ok(rtnh, remaining)) { 3772 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3773 if (rtnh->rtnh_ifindex) 3774 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3775 3776 attrlen = rtnh_attrlen(rtnh); 3777 if (attrlen > 0) { 3778 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3779 3780 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3781 if (nla) { 3782 r_cfg.fc_gateway = nla_get_in6_addr(nla); 3783 r_cfg.fc_flags |= RTF_GATEWAY; 3784 } 3785 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 3786 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 3787 if (nla) 3788 r_cfg.fc_encap_type = nla_get_u16(nla); 3789 } 3790 3791 rt = ip6_route_info_create(&r_cfg, extack); 3792 if (IS_ERR(rt)) { 3793 err = PTR_ERR(rt); 3794 rt = NULL; 3795 goto cleanup; 3796 } 3797 3798 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3799 if (err) { 3800 dst_release_immediate(&rt->dst); 3801 goto cleanup; 3802 } 3803 3804 rtnh = rtnh_next(rtnh, &remaining); 3805 } 3806 3807 /* for add and replace, send one notification with all nexthops. 3808 * Skip the notification in fib6_add_rt2node and send one with 3809 * the full route when done 3810 */ 3811 info->skip_notify = 1; 3812 3813 err_nh = NULL; 3814 list_for_each_entry(nh, &rt6_nh_list, next) { 3815 rt_last = nh->rt6_info; 3816 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 3817 /* save reference to first route for notification */ 3818 if (!rt_notif && !err) 3819 rt_notif = nh->rt6_info; 3820 3821 /* nh->rt6_info is used or freed at this point, reset to NULL */ 3822 nh->rt6_info = NULL; 3823 if (err) { 3824 if (replace && nhn) 3825 ip6_print_replace_route_err(&rt6_nh_list); 3826 err_nh = nh; 3827 goto add_errout; 3828 } 3829 3830 /* Because each route is added like a single route, we remove 3831 * these flags after the first nexthop: if there is a collision, 3832 * we have already failed to add the first nexthop, as 3833 * fib6_add_rt2node() has rejected it; when replacing, the old 3834 * nexthops have been replaced by the first new one, and the 3835 * rest should be added to it. 3836 */ 3837 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 3838 NLM_F_REPLACE); 3839 nhn++; 3840 } 3841 3842 /* success ...
tell user about new route */ 3843 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3844 goto cleanup; 3845 3846 add_errout: 3847 /* send notification for routes that were added so that 3848 * the delete notifications sent by ip6_route_del are 3849 * coherent 3850 */ 3851 if (rt_notif) 3852 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3853 3854 /* Delete routes that were already added */ 3855 list_for_each_entry(nh, &rt6_nh_list, next) { 3856 if (err_nh == nh) 3857 break; 3858 ip6_route_del(&nh->r_cfg, extack); 3859 } 3860 3861 cleanup: 3862 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3863 if (nh->rt6_info) 3864 dst_release_immediate(&nh->rt6_info->dst); 3865 kfree(nh->mxc.mx); 3866 list_del(&nh->next); 3867 kfree(nh); 3868 } 3869 3870 return err; 3871 } 3872 3873 static int ip6_route_multipath_del(struct fib6_config *cfg, 3874 struct netlink_ext_ack *extack) 3875 { 3876 struct fib6_config r_cfg; 3877 struct rtnexthop *rtnh; 3878 int remaining; 3879 int attrlen; 3880 int err = 1, last_err = 0; 3881 3882 remaining = cfg->fc_mp_len; 3883 rtnh = (struct rtnexthop *)cfg->fc_mp; 3884 3885 /* Parse a Multipath Entry */ 3886 while (rtnh_ok(rtnh, remaining)) { 3887 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3888 if (rtnh->rtnh_ifindex) 3889 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3890 3891 attrlen = rtnh_attrlen(rtnh); 3892 if (attrlen > 0) { 3893 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3894 3895 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3896 if (nla) { 3897 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 3898 r_cfg.fc_flags |= RTF_GATEWAY; 3899 } 3900 } 3901 err = ip6_route_del(&r_cfg, extack); 3902 if (err) 3903 last_err = err; 3904 3905 rtnh = rtnh_next(rtnh, &remaining); 3906 } 3907 3908 return last_err; 3909 } 3910 3911 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3912 struct netlink_ext_ack *extack) 3913 { 3914 struct fib6_config cfg; 3915 int err; 3916 3917 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3918 if (err < 0) 3919 return err; 3920 3921 if (cfg.fc_mp) 3922 return ip6_route_multipath_del(&cfg, extack); 3923 else { 3924 cfg.fc_delete_all_nh = 1; 3925 return ip6_route_del(&cfg, extack); 3926 } 3927 } 3928 3929 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3930 struct netlink_ext_ack *extack) 3931 { 3932 struct fib6_config cfg; 3933 int err; 3934 3935 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3936 if (err < 0) 3937 return err; 3938 3939 if (cfg.fc_mp) 3940 return ip6_route_multipath_add(&cfg, extack); 3941 else 3942 return ip6_route_add(&cfg, extack); 3943 } 3944 3945 static size_t rt6_nlmsg_size(struct rt6_info *rt) 3946 { 3947 int nexthop_len = 0; 3948 3949 if (rt->rt6i_nsiblings) { 3950 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 3951 + NLA_ALIGN(sizeof(struct rtnexthop)) 3952 + nla_total_size(16) /* RTA_GATEWAY */ 3953 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3954 3955 nexthop_len *= rt->rt6i_nsiblings; 3956 } 3957 3958 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3959 + nla_total_size(16) /* RTA_SRC */ 3960 + nla_total_size(16) /* RTA_DST */ 3961 + nla_total_size(16) /* RTA_GATEWAY */ 3962 + nla_total_size(16) /* RTA_PREFSRC */ 3963 + nla_total_size(4) /* RTA_TABLE */ 3964 + nla_total_size(4) /* RTA_IIF */ 3965 + nla_total_size(4) /* RTA_OIF */ 3966 + nla_total_size(4) /* RTA_PRIORITY */ 3967 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 3968 + nla_total_size(sizeof(struct rta_cacheinfo)) 3969 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3970 + nla_total_size(1) /* 
RTA_PREF */ 3971 + lwtunnel_get_encap_size(rt->dst.lwtstate) 3972 + nexthop_len; 3973 } 3974 3975 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 3976 unsigned int *flags, bool skip_oif) 3977 { 3978 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { 3979 *flags |= RTNH_F_LINKDOWN; 3980 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 3981 *flags |= RTNH_F_DEAD; 3982 } 3983 3984 if (rt->rt6i_flags & RTF_GATEWAY) { 3985 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 3986 goto nla_put_failure; 3987 } 3988 3989 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 3990 *flags |= RTNH_F_OFFLOAD; 3991 3992 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 3993 if (!skip_oif && rt->dst.dev && 3994 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 3995 goto nla_put_failure; 3996 3997 if (rt->dst.lwtstate && 3998 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 3999 goto nla_put_failure; 4000 4001 return 0; 4002 4003 nla_put_failure: 4004 return -EMSGSIZE; 4005 } 4006 4007 /* add multipath next hop */ 4008 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 4009 { 4010 struct rtnexthop *rtnh; 4011 unsigned int flags = 0; 4012 4013 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 4014 if (!rtnh) 4015 goto nla_put_failure; 4016 4017 rtnh->rtnh_hops = 0; 4018 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 4019 4020 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 4021 goto nla_put_failure; 4022 4023 rtnh->rtnh_flags = flags; 4024 4025 /* length of rtnetlink header + attributes */ 4026 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 4027 4028 return 0; 4029 4030 nla_put_failure: 4031 return -EMSGSIZE; 4032 } 4033 4034 static int rt6_fill_node(struct net *net, 4035 struct sk_buff *skb, struct rt6_info *rt, 4036 struct in6_addr *dst, struct in6_addr *src, 4037 int iif, int type, u32 portid, u32 seq, 4038 unsigned int flags) 4039 { 4040 u32 metrics[RTAX_MAX]; 4041 struct rtmsg *rtm; 4042 struct nlmsghdr *nlh; 4043 long expires; 4044 u32 table; 4045 4046 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 4047 if (!nlh) 4048 return -EMSGSIZE; 4049 4050 rtm = nlmsg_data(nlh); 4051 rtm->rtm_family = AF_INET6; 4052 rtm->rtm_dst_len = rt->rt6i_dst.plen; 4053 rtm->rtm_src_len = rt->rt6i_src.plen; 4054 rtm->rtm_tos = 0; 4055 if (rt->rt6i_table) 4056 table = rt->rt6i_table->tb6_id; 4057 else 4058 table = RT6_TABLE_UNSPEC; 4059 rtm->rtm_table = table; 4060 if (nla_put_u32(skb, RTA_TABLE, table)) 4061 goto nla_put_failure; 4062 if (rt->rt6i_flags & RTF_REJECT) { 4063 switch (rt->dst.error) { 4064 case -EINVAL: 4065 rtm->rtm_type = RTN_BLACKHOLE; 4066 break; 4067 case -EACCES: 4068 rtm->rtm_type = RTN_PROHIBIT; 4069 break; 4070 case -EAGAIN: 4071 rtm->rtm_type = RTN_THROW; 4072 break; 4073 default: 4074 rtm->rtm_type = RTN_UNREACHABLE; 4075 break; 4076 } 4077 } 4078 else if (rt->rt6i_flags & RTF_LOCAL) 4079 rtm->rtm_type = RTN_LOCAL; 4080 else if (rt->rt6i_flags & RTF_ANYCAST) 4081 rtm->rtm_type = RTN_ANYCAST; 4082 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 4083 rtm->rtm_type = RTN_LOCAL; 4084 else 4085 rtm->rtm_type = RTN_UNICAST; 4086 rtm->rtm_flags = 0; 4087 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4088 rtm->rtm_protocol = rt->rt6i_protocol; 4089 4090 if (rt->rt6i_flags & RTF_CACHE) 4091 rtm->rtm_flags |= RTM_F_CLONED; 4092 4093 if (dst) { 4094 if (nla_put_in6_addr(skb, RTA_DST, dst)) 4095 goto nla_put_failure; 4096 rtm->rtm_dst_len = 128; 4097 } else if (rtm->rtm_dst_len) 4098 if 
(nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 4099 goto nla_put_failure; 4100 #ifdef CONFIG_IPV6_SUBTREES 4101 if (src) { 4102 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4103 goto nla_put_failure; 4104 rtm->rtm_src_len = 128; 4105 } else if (rtm->rtm_src_len && 4106 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 4107 goto nla_put_failure; 4108 #endif 4109 if (iif) { 4110 #ifdef CONFIG_IPV6_MROUTE 4111 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 4112 int err = ip6mr_get_route(net, skb, rtm, portid); 4113 4114 if (err == 0) 4115 return 0; 4116 if (err < 0) 4117 goto nla_put_failure; 4118 } else 4119 #endif 4120 if (nla_put_u32(skb, RTA_IIF, iif)) 4121 goto nla_put_failure; 4122 } else if (dst) { 4123 struct in6_addr saddr_buf; 4124 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 4125 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4126 goto nla_put_failure; 4127 } 4128 4129 if (rt->rt6i_prefsrc.plen) { 4130 struct in6_addr saddr_buf; 4131 saddr_buf = rt->rt6i_prefsrc.addr; 4132 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4133 goto nla_put_failure; 4134 } 4135 4136 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 4137 if (rt->rt6i_pmtu) 4138 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 4139 if (rtnetlink_put_metrics(skb, metrics) < 0) 4140 goto nla_put_failure; 4141 4142 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 4143 goto nla_put_failure; 4144 4145 /* For multipath routes, walk the siblings list and add 4146 * each as a nexthop within RTA_MULTIPATH. 4147 */ 4148 if (rt->rt6i_nsiblings) { 4149 struct rt6_info *sibling, *next_sibling; 4150 struct nlattr *mp; 4151 4152 mp = nla_nest_start(skb, RTA_MULTIPATH); 4153 if (!mp) 4154 goto nla_put_failure; 4155 4156 if (rt6_add_nexthop(skb, rt) < 0) 4157 goto nla_put_failure; 4158 4159 list_for_each_entry_safe(sibling, next_sibling, 4160 &rt->rt6i_siblings, rt6i_siblings) { 4161 if (rt6_add_nexthop(skb, sibling) < 0) 4162 goto nla_put_failure; 4163 } 4164 4165 nla_nest_end(skb, mp); 4166 } else { 4167 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4168 goto nla_put_failure; 4169 } 4170 4171 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
	expires = (rt->rt6i_flags & RTF_EXPIRES) ?
		  rt->dst.expires - jiffies : 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
		     NLM_F_MULTI);
}
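/* RTM_GETROUTE handler: resolve the route for the addresses given in
 * the request and unicast the result back to the requester.  With
 * RTM_F_FIB_MATCH set, the matching FIB entry itself is returned
 * rather than the dst a full lookup would produce.  This is what,
 * e.g., "ip -6 route get 2001:db8::1" (and its "fibmatch" variant)
 * ends up exercising.
 */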
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
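/* Point the special null/prohibit/blackhole entries at the namespace's
 * loopback device on NETDEV_REGISTER, and release their inet6_dev
 * references again on NETDEV_UNREGISTER.
 */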
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

/* /proc/net/rt6_stats: seven hex words - fib nodes, route nodes,
 * route allocations, route entries, cached routes, dst cache entries
 * and discarded routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL
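/* Writing to the "flush" sysctl (net.ipv6.route.flush) forces an
 * immediate garbage-collection pass over the IPv6 routing tables,
 * e.g.:
 *
 *	sysctl -w net.ipv6.route.flush=1
 *
 * The file is write-only; reads fail with -EINVAL.
 */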
static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
				     void __user *buffer, size_t *lenp,
				     loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export the "flush" sysctl to unprivileged user
		 * namespaces.
		 */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
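/* Per-namespace setup: clone the dst_ops template, allocate this
 * namespace's copies of the special null/prohibit/blackhole entries
 * and seed the route GC tunables.  Failures unwind in reverse
 * allocation order through the out_* labels.
 */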
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
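/* Each namespace carries its own inetpeer base; the peer tree is
 * invalidated and freed together with the namespace.
 */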
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	= ipv6_inetpeer_init,
	.exit	= ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so its
	 * NETDEV_REGISTER event never reached ip6_route_dev_notify() and
	 * the loopback references were not taken; take them manually for
	 * init_net here.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
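/* Module teardown: undo ip6_route_init(), largely in the reverse order
 * of initialization.
 */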
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}