/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}
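/* Operations vtable shared by every IPv6 dst entry allocated from
 * ip6_dst_ops.  The blackhole variant further down reuses most of
 * these callbacks but turns PMTU updates and redirects into no-ops.
 */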
static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	/* We might have already computed the hash for ICMPv6 errors.  In
	 * such a case it will always be non-zero.  Otherwise now is the
	 * time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (the siblings list does not include ourself)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
					 &match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
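/* Worked example for rt6_multipath_select() above: with
 * rt6i_nsiblings == 2 (three sibling routes in total), fl6->mp_hash % 3
 * yields 0, 1 or 2.  0 keeps the route the lookup already matched;
 * 1 or 2 walks that far down rt6i_siblings, keeping the original match
 * if the chosen sibling scores negatively.
 */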
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
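/* rt6_probe() below is reached from the route scoring path with the
 * FIB read lock held and bottom halves disabled, so the neighbour
 * solicitation is handed off to the work item above instead of being
 * sent inline.
 */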
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 *	Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
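/* rt6_select() above implements the round-robin half of default router
 * selection: when find_rr_leaf() requests it via do_rr, fn->rr_ptr is
 * advanced to the next route of equal metric so that the following
 * lookup starts from a different candidate.
 */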
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}
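/* rt6_make_pcpu_route() installs a freshly cloned pcpu route with
 * cmpxchg(): if another context installed one into this CPU's slot
 * first, the new clone is released immediately and the existing entry
 * is returned instead.
 */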
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	if (!bucket || !rt6_ex)
		return;
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
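/* At the time of writing, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT in
 * ip6_fib.h is 10, i.e. each route can hash its exceptions into up to
 * 1024 chains, each of which rt6_insert_exception() caps at
 * FIB6_MAX_DEPTH entries.
 */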
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err)
		fib6_update_sernum(ort);

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}
/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_info *from = (struct rt6_info *)rt->dst.from;
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;
				/* For RTF_CACHE with rt6i_pmtu == 0
				 * (i.e. a redirected route),
				 * the metrics of its rt->dst.from have
				 * already been updated.
				 */
				if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu)
					entry->rt6i_pmtu = mtu;
			}
			bucket++;
		}
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	if (atomic_read(&rt->dst.__refcnt) == 1 &&
	    time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	} else if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
		if (neigh) {
			neigh_flags = neigh->flags;
			neigh_release(neigh);
		}
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock_bh(&rt6_exception_lock);
}
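/* ip6_pol_route() resolves a flow to one of three kinds of result:
 * the null entry or an RTF_CACHE clone returned as-is, an uncached
 * RTF_CACHE clone kept off the fib6 tree (FLOWI_FLAG_KNOWN_NH without
 * a gateway), or a per-cpu copy of the matched route.
 */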
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);
	}

	return get_hash_from_flowi6(fl6);
}

void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
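/* ip6_update_pmtu() expects skb->data to point at the ipv6hdr that
 * triggered the update (typically the inner header carried in an
 * ICMPv6 packet-too-big error); mtu arrives in network byte order,
 * hence the ntohl() before the update.
 */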
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from an appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
&ipv6_hdr(skb)->saddr); 2154 rt6_do_redirect(dst, NULL, skb); 2155 dst_release(dst); 2156 } 2157 EXPORT_SYMBOL_GPL(ip6_redirect); 2158 2159 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif, 2160 u32 mark) 2161 { 2162 const struct ipv6hdr *iph = ipv6_hdr(skb); 2163 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); 2164 struct dst_entry *dst; 2165 struct flowi6 fl6; 2166 2167 memset(&fl6, 0, sizeof(fl6)); 2168 fl6.flowi6_iif = LOOPBACK_IFINDEX; 2169 fl6.flowi6_oif = oif; 2170 fl6.flowi6_mark = mark; 2171 fl6.daddr = msg->dest; 2172 fl6.saddr = iph->daddr; 2173 fl6.flowi6_uid = sock_net_uid(net, NULL); 2174 2175 dst = ip6_route_redirect(net, &fl6, &iph->saddr); 2176 rt6_do_redirect(dst, NULL, skb); 2177 dst_release(dst); 2178 } 2179 2180 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) 2181 { 2182 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark, 2183 sk->sk_uid); 2184 } 2185 EXPORT_SYMBOL_GPL(ip6_sk_redirect); 2186 2187 static unsigned int ip6_default_advmss(const struct dst_entry *dst) 2188 { 2189 struct net_device *dev = dst->dev; 2190 unsigned int mtu = dst_mtu(dst); 2191 struct net *net = dev_net(dev); 2192 2193 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); 2194 2195 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) 2196 mtu = net->ipv6.sysctl.ip6_rt_min_advmss; 2197 2198 /* 2199 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 2200 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 2201 * IPV6_MAXPLEN is also valid and means: "any MSS, 2202 * rely only on pmtu discovery" 2203 */ 2204 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) 2205 mtu = IPV6_MAXPLEN; 2206 return mtu; 2207 } 2208 2209 static unsigned int ip6_mtu(const struct dst_entry *dst) 2210 { 2211 const struct rt6_info *rt = (const struct rt6_info *)dst; 2212 unsigned int mtu = rt->rt6i_pmtu; 2213 struct inet6_dev *idev; 2214 2215 if (mtu) 2216 goto out; 2217 2218 mtu = dst_metric_raw(dst, RTAX_MTU); 2219 if (mtu) 2220 goto out; 2221 2222 mtu = IPV6_MIN_MTU; 2223 2224 rcu_read_lock(); 2225 idev = __in6_dev_get(dst->dev); 2226 if (idev) 2227 mtu = idev->cnf.mtu6; 2228 rcu_read_unlock(); 2229 2230 out: 2231 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); 2232 2233 return mtu - lwtunnel_headroom(dst->lwtstate, mtu); 2234 } 2235 2236 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, 2237 struct flowi6 *fl6) 2238 { 2239 struct dst_entry *dst; 2240 struct rt6_info *rt; 2241 struct inet6_dev *idev = in6_dev_get(dev); 2242 struct net *net = dev_net(dev); 2243 2244 if (unlikely(!idev)) 2245 return ERR_PTR(-ENODEV); 2246 2247 rt = ip6_dst_alloc(net, dev, 0); 2248 if (unlikely(!rt)) { 2249 in6_dev_put(idev); 2250 dst = ERR_PTR(-ENOMEM); 2251 goto out; 2252 } 2253 2254 rt->dst.flags |= DST_HOST; 2255 rt->dst.output = ip6_output; 2256 rt->rt6i_gateway = fl6->daddr; 2257 rt->rt6i_dst.addr = fl6->daddr; 2258 rt->rt6i_dst.plen = 128; 2259 rt->rt6i_idev = idev; 2260 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); 2261 2262 /* Add this dst into uncached_list so that rt6_ifdown() can 2263 * do proper release of the net_device 2264 */ 2265 rt6_uncached_list_add(rt); 2266 2267 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); 2268 2269 out: 2270 return dst; 2271 } 2272 2273 static int ip6_dst_gc(struct dst_ops *ops) 2274 { 2275 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); 2276 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; 2277 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size; 2278 int rt_elasticity = 
net->ipv6.sysctl.ip6_rt_gc_elasticity; 2279 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; 2280 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; 2281 int entries; 2282 2283 entries = dst_entries_get_fast(ops); 2284 if (time_after(rt_last_gc + rt_min_interval, jiffies) && 2285 entries <= rt_max_size) 2286 goto out; 2287 2288 net->ipv6.ip6_rt_gc_expire++; 2289 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true); 2290 entries = dst_entries_get_slow(ops); 2291 if (entries < ops->gc_thresh) 2292 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; 2293 out: 2294 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; 2295 return entries > rt_max_size; 2296 } 2297 2298 static int ip6_convert_metrics(struct mx6_config *mxc, 2299 const struct fib6_config *cfg) 2300 { 2301 bool ecn_ca = false; 2302 struct nlattr *nla; 2303 int remaining; 2304 u32 *mp; 2305 2306 if (!cfg->fc_mx) 2307 return 0; 2308 2309 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); 2310 if (unlikely(!mp)) 2311 return -ENOMEM; 2312 2313 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 2314 int type = nla_type(nla); 2315 u32 val; 2316 2317 if (!type) 2318 continue; 2319 if (unlikely(type > RTAX_MAX)) 2320 goto err; 2321 2322 if (type == RTAX_CC_ALGO) { 2323 char tmp[TCP_CA_NAME_MAX]; 2324 2325 nla_strlcpy(tmp, nla, sizeof(tmp)); 2326 val = tcp_ca_get_key_by_name(tmp, &ecn_ca); 2327 if (val == TCP_CA_UNSPEC) 2328 goto err; 2329 } else { 2330 val = nla_get_u32(nla); 2331 } 2332 if (type == RTAX_HOPLIMIT && val > 255) 2333 val = 255; 2334 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) 2335 goto err; 2336 2337 mp[type - 1] = val; 2338 __set_bit(type - 1, mxc->mx_valid); 2339 } 2340 2341 if (ecn_ca) { 2342 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); 2343 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; 2344 } 2345 2346 mxc->mx = mp; 2347 return 0; 2348 err: 2349 kfree(mp); 2350 return -EINVAL; 2351 } 2352 2353 static struct rt6_info *ip6_nh_lookup_table(struct net *net, 2354 struct fib6_config *cfg, 2355 const struct in6_addr *gw_addr) 2356 { 2357 struct flowi6 fl6 = { 2358 .flowi6_oif = cfg->fc_ifindex, 2359 .daddr = *gw_addr, 2360 .saddr = cfg->fc_prefsrc, 2361 }; 2362 struct fib6_table *table; 2363 struct rt6_info *rt; 2364 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE; 2365 2366 table = fib6_get_table(net, cfg->fc_table); 2367 if (!table) 2368 return NULL; 2369 2370 if (!ipv6_addr_any(&cfg->fc_prefsrc)) 2371 flags |= RT6_LOOKUP_F_HAS_SADDR; 2372 2373 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); 2374 2375 /* if table lookup failed, fall back to full lookup */ 2376 if (rt == net->ipv6.ip6_null_entry) { 2377 ip6_rt_put(rt); 2378 rt = NULL; 2379 } 2380 2381 return rt; 2382 } 2383 2384 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, 2385 struct netlink_ext_ack *extack) 2386 { 2387 struct net *net = cfg->fc_nlinfo.nl_net; 2388 struct rt6_info *rt = NULL; 2389 struct net_device *dev = NULL; 2390 struct inet6_dev *idev = NULL; 2391 struct fib6_table *table; 2392 int addr_type; 2393 int err = -EINVAL; 2394 2395 /* RTF_PCPU is an internal flag; can not be set by userspace */ 2396 if (cfg->fc_flags & RTF_PCPU) { 2397 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); 2398 goto out; 2399 } 2400 2401 if (cfg->fc_dst_len > 128) { 2402 NL_SET_ERR_MSG(extack, "Invalid prefix length"); 2403 goto out; 2404 } 2405 if (cfg->fc_src_len > 128) { 2406 NL_SET_ERR_MSG(extack, "Invalid source address length"); 2407 goto out; 2408 } 2409 #ifndef 
CONFIG_IPV6_SUBTREES 2410 if (cfg->fc_src_len) { 2411 NL_SET_ERR_MSG(extack, 2412 "Specifying source address requires IPV6_SUBTREES to be enabled"); 2413 goto out; 2414 } 2415 #endif 2416 if (cfg->fc_ifindex) { 2417 err = -ENODEV; 2418 dev = dev_get_by_index(net, cfg->fc_ifindex); 2419 if (!dev) 2420 goto out; 2421 idev = in6_dev_get(dev); 2422 if (!idev) 2423 goto out; 2424 } 2425 2426 if (cfg->fc_metric == 0) 2427 cfg->fc_metric = IP6_RT_PRIO_USER; 2428 2429 err = -ENOBUFS; 2430 if (cfg->fc_nlinfo.nlh && 2431 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { 2432 table = fib6_get_table(net, cfg->fc_table); 2433 if (!table) { 2434 pr_warn("NLM_F_CREATE should be specified when creating new route\n"); 2435 table = fib6_new_table(net, cfg->fc_table); 2436 } 2437 } else { 2438 table = fib6_new_table(net, cfg->fc_table); 2439 } 2440 2441 if (!table) 2442 goto out; 2443 2444 rt = ip6_dst_alloc(net, NULL, 2445 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT); 2446 2447 if (!rt) { 2448 err = -ENOMEM; 2449 goto out; 2450 } 2451 2452 if (cfg->fc_flags & RTF_EXPIRES) 2453 rt6_set_expires(rt, jiffies + 2454 clock_t_to_jiffies(cfg->fc_expires)); 2455 else 2456 rt6_clean_expires(rt); 2457 2458 if (cfg->fc_protocol == RTPROT_UNSPEC) 2459 cfg->fc_protocol = RTPROT_BOOT; 2460 rt->rt6i_protocol = cfg->fc_protocol; 2461 2462 addr_type = ipv6_addr_type(&cfg->fc_dst); 2463 2464 if (addr_type & IPV6_ADDR_MULTICAST) 2465 rt->dst.input = ip6_mc_input; 2466 else if (cfg->fc_flags & RTF_LOCAL) 2467 rt->dst.input = ip6_input; 2468 else 2469 rt->dst.input = ip6_forward; 2470 2471 rt->dst.output = ip6_output; 2472 2473 if (cfg->fc_encap) { 2474 struct lwtunnel_state *lwtstate; 2475 2476 err = lwtunnel_build_state(cfg->fc_encap_type, 2477 cfg->fc_encap, AF_INET6, cfg, 2478 &lwtstate, extack); 2479 if (err) 2480 goto out; 2481 rt->dst.lwtstate = lwtstate_get(lwtstate); 2482 if (lwtunnel_output_redirect(rt->dst.lwtstate)) { 2483 rt->dst.lwtstate->orig_output = rt->dst.output; 2484 rt->dst.output = lwtunnel_output; 2485 } 2486 if (lwtunnel_input_redirect(rt->dst.lwtstate)) { 2487 rt->dst.lwtstate->orig_input = rt->dst.input; 2488 rt->dst.input = lwtunnel_input; 2489 } 2490 } 2491 2492 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); 2493 rt->rt6i_dst.plen = cfg->fc_dst_len; 2494 if (rt->rt6i_dst.plen == 128) 2495 rt->dst.flags |= DST_HOST; 2496 2497 #ifdef CONFIG_IPV6_SUBTREES 2498 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len); 2499 rt->rt6i_src.plen = cfg->fc_src_len; 2500 #endif 2501 2502 rt->rt6i_metric = cfg->fc_metric; 2503 2504 /* We cannot add true routes via loopback here, 2505 they would result in kernel looping; promote them to reject routes 2506 */ 2507 if ((cfg->fc_flags & RTF_REJECT) || 2508 (dev && (dev->flags & IFF_LOOPBACK) && 2509 !(addr_type & IPV6_ADDR_LOOPBACK) && 2510 !(cfg->fc_flags & RTF_LOCAL))) { 2511 /* hold loopback dev/idev if we haven't done so. 
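The reject route created below is always bound to the loopback device, so the references held on the original dev/idev are dropped and fresh ones are taken on loopback before install_route records them.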
*/ 2512 if (dev != net->loopback_dev) { 2513 if (dev) { 2514 dev_put(dev); 2515 in6_dev_put(idev); 2516 } 2517 dev = net->loopback_dev; 2518 dev_hold(dev); 2519 idev = in6_dev_get(dev); 2520 if (!idev) { 2521 err = -ENODEV; 2522 goto out; 2523 } 2524 } 2525 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; 2526 switch (cfg->fc_type) { 2527 case RTN_BLACKHOLE: 2528 rt->dst.error = -EINVAL; 2529 rt->dst.output = dst_discard_out; 2530 rt->dst.input = dst_discard; 2531 break; 2532 case RTN_PROHIBIT: 2533 rt->dst.error = -EACCES; 2534 rt->dst.output = ip6_pkt_prohibit_out; 2535 rt->dst.input = ip6_pkt_prohibit; 2536 break; 2537 case RTN_THROW: 2538 case RTN_UNREACHABLE: 2539 default: 2540 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN 2541 : (cfg->fc_type == RTN_UNREACHABLE) 2542 ? -EHOSTUNREACH : -ENETUNREACH; 2543 rt->dst.output = ip6_pkt_discard_out; 2544 rt->dst.input = ip6_pkt_discard; 2545 break; 2546 } 2547 goto install_route; 2548 } 2549 2550 if (cfg->fc_flags & RTF_GATEWAY) { 2551 const struct in6_addr *gw_addr; 2552 int gwa_type; 2553 2554 gw_addr = &cfg->fc_gateway; 2555 gwa_type = ipv6_addr_type(gw_addr); 2556 2557 /* if gw_addr is local we will fail to detect this in case the 2558 * address is still TENTATIVE (DAD in progress). rt6_lookup() 2559 * will return the already-added prefix route via the interface that 2560 * the prefix route was assigned to, which might be non-loopback. 2561 */ 2562 err = -EINVAL; 2563 if (ipv6_chk_addr_and_flags(net, gw_addr, 2564 gwa_type & IPV6_ADDR_LINKLOCAL ? 2565 dev : NULL, 0, 0)) { 2566 NL_SET_ERR_MSG(extack, "Invalid gateway address"); 2567 goto out; 2568 } 2569 rt->rt6i_gateway = *gw_addr; 2570 2571 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { 2572 struct rt6_info *grt = NULL; 2573 2574 /* IPv6 strictly forbids using non-link-local 2575 addresses as a nexthop address; 2576 otherwise, the router will not be able to send redirects. 2577 That is a good rule, but in some (rare!) circumstances 2578 (SIT, PtP, NBMA NOARP links) it is handy to allow 2579 some exceptions.
--ANK 2580 We allow IPv4-mapped nexthops to support RFC4798-type 2581 addressing 2582 */ 2583 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2584 IPV6_ADDR_MAPPED))) { 2585 NL_SET_ERR_MSG(extack, 2586 "Invalid gateway address"); 2587 goto out; 2588 } 2589 2590 if (cfg->fc_table) { 2591 grt = ip6_nh_lookup_table(net, cfg, gw_addr); 2592 2593 if (grt) { 2594 if (grt->rt6i_flags & RTF_GATEWAY || 2595 (dev && dev != grt->dst.dev)) { 2596 ip6_rt_put(grt); 2597 grt = NULL; 2598 } 2599 } 2600 } 2601 2602 if (!grt) 2603 grt = rt6_lookup(net, gw_addr, NULL, 2604 cfg->fc_ifindex, 1); 2605 2606 err = -EHOSTUNREACH; 2607 if (!grt) 2608 goto out; 2609 if (dev) { 2610 if (dev != grt->dst.dev) { 2611 ip6_rt_put(grt); 2612 goto out; 2613 } 2614 } else { 2615 dev = grt->dst.dev; 2616 idev = grt->rt6i_idev; 2617 dev_hold(dev); 2618 in6_dev_hold(grt->rt6i_idev); 2619 } 2620 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2621 err = 0; 2622 ip6_rt_put(grt); 2623 2624 if (err) 2625 goto out; 2626 } 2627 err = -EINVAL; 2628 if (!dev) { 2629 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2630 goto out; 2631 } else if (dev->flags & IFF_LOOPBACK) { 2632 NL_SET_ERR_MSG(extack, 2633 "Egress device can not be loopback device for this route"); 2634 goto out; 2635 } 2636 } 2637 2638 err = -ENODEV; 2639 if (!dev) 2640 goto out; 2641 2642 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2643 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2644 NL_SET_ERR_MSG(extack, "Invalid source address"); 2645 err = -EINVAL; 2646 goto out; 2647 } 2648 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2649 rt->rt6i_prefsrc.plen = 128; 2650 } else 2651 rt->rt6i_prefsrc.plen = 0; 2652 2653 rt->rt6i_flags = cfg->fc_flags; 2654 2655 install_route: 2656 rt->dst.dev = dev; 2657 rt->rt6i_idev = idev; 2658 rt->rt6i_table = table; 2659 2660 cfg->fc_nlinfo.nl_net = dev_net(dev); 2661 2662 return rt; 2663 out: 2664 if (dev) 2665 dev_put(dev); 2666 if (idev) 2667 in6_dev_put(idev); 2668 if (rt) 2669 dst_release_immediate(&rt->dst); 2670 2671 return ERR_PTR(err); 2672 } 2673 2674 int ip6_route_add(struct fib6_config *cfg, 2675 struct netlink_ext_ack *extack) 2676 { 2677 struct mx6_config mxc = { .mx = NULL, }; 2678 struct rt6_info *rt; 2679 int err; 2680 2681 rt = ip6_route_info_create(cfg, extack); 2682 if (IS_ERR(rt)) { 2683 err = PTR_ERR(rt); 2684 rt = NULL; 2685 goto out; 2686 } 2687 2688 err = ip6_convert_metrics(&mxc, cfg); 2689 if (err) 2690 goto out; 2691 2692 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2693 2694 kfree(mxc.mx); 2695 2696 return err; 2697 out: 2698 if (rt) 2699 dst_release_immediate(&rt->dst); 2700 2701 return err; 2702 } 2703 2704 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2705 { 2706 int err; 2707 struct fib6_table *table; 2708 struct net *net = dev_net(rt->dst.dev); 2709 2710 if (rt == net->ipv6.ip6_null_entry) { 2711 err = -ENOENT; 2712 goto out; 2713 } 2714 2715 table = rt->rt6i_table; 2716 write_lock_bh(&table->tb6_lock); 2717 err = fib6_del(rt, info); 2718 write_unlock_bh(&table->tb6_lock); 2719 2720 out: 2721 ip6_rt_put(rt); 2722 return err; 2723 } 2724 2725 int ip6_del_rt(struct rt6_info *rt) 2726 { 2727 struct nl_info info = { 2728 .nl_net = dev_net(rt->dst.dev), 2729 }; 2730 return __ip6_del_rt(rt, &info); 2731 } 2732 2733 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2734 { 2735 struct nl_info *info = &cfg->fc_nlinfo; 2736 struct net *net = info->nl_net; 2737 struct sk_buff *skb = NULL; 2738 struct fib6_table *table; 2739 int err = -ENOENT; 2740 2741 if (rt == 
net->ipv6.ip6_null_entry) 2742 goto out_put; 2743 table = rt->rt6i_table; 2744 write_lock_bh(&table->tb6_lock); 2745 2746 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2747 struct rt6_info *sibling, *next_sibling; 2748 2749 /* prefer to send a single notification with all hops */ 2750 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2751 if (skb) { 2752 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2753 2754 if (rt6_fill_node(net, skb, rt, 2755 NULL, NULL, 0, RTM_DELROUTE, 2756 info->portid, seq, 0) < 0) { 2757 kfree_skb(skb); 2758 skb = NULL; 2759 } else 2760 info->skip_notify = 1; 2761 } 2762 2763 list_for_each_entry_safe(sibling, next_sibling, 2764 &rt->rt6i_siblings, 2765 rt6i_siblings) { 2766 err = fib6_del(sibling, info); 2767 if (err) 2768 goto out_unlock; 2769 } 2770 } 2771 2772 err = fib6_del(rt, info); 2773 out_unlock: 2774 write_unlock_bh(&table->tb6_lock); 2775 out_put: 2776 ip6_rt_put(rt); 2777 2778 if (skb) { 2779 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2780 info->nlh, gfp_any()); 2781 } 2782 return err; 2783 } 2784 2785 static int ip6_route_del(struct fib6_config *cfg, 2786 struct netlink_ext_ack *extack) 2787 { 2788 struct fib6_table *table; 2789 struct fib6_node *fn; 2790 struct rt6_info *rt; 2791 int err = -ESRCH; 2792 2793 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2794 if (!table) { 2795 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2796 return err; 2797 } 2798 2799 read_lock_bh(&table->tb6_lock); 2800 2801 fn = fib6_locate(&table->tb6_root, 2802 &cfg->fc_dst, cfg->fc_dst_len, 2803 &cfg->fc_src, cfg->fc_src_len, 2804 true); 2805 2806 if (fn) { 2807 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 2808 if ((rt->rt6i_flags & RTF_CACHE) && 2809 !(cfg->fc_flags & RTF_CACHE)) 2810 continue; 2811 if (cfg->fc_ifindex && 2812 (!rt->dst.dev || 2813 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2814 continue; 2815 if (cfg->fc_flags & RTF_GATEWAY && 2816 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2817 continue; 2818 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2819 continue; 2820 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2821 continue; 2822 dst_hold(&rt->dst); 2823 read_unlock_bh(&table->tb6_lock); 2824 2825 /* if gateway was specified only delete the one hop */ 2826 if (cfg->fc_flags & RTF_GATEWAY) 2827 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2828 2829 return __ip6_del_rt_siblings(rt, cfg); 2830 } 2831 } 2832 read_unlock_bh(&table->tb6_lock); 2833 2834 return err; 2835 } 2836 2837 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 2838 { 2839 struct netevent_redirect netevent; 2840 struct rt6_info *rt, *nrt = NULL; 2841 struct ndisc_options ndopts; 2842 struct inet6_dev *in6_dev; 2843 struct neighbour *neigh; 2844 struct rd_msg *msg; 2845 int optlen, on_link; 2846 u8 *lladdr; 2847 2848 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 2849 optlen -= sizeof(*msg); 2850 2851 if (optlen < 0) { 2852 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 2853 return; 2854 } 2855 2856 msg = (struct rd_msg *)icmp6_hdr(skb); 2857 2858 if (ipv6_addr_is_multicast(&msg->dest)) { 2859 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 2860 return; 2861 } 2862 2863 on_link = 0; 2864 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 2865 on_link = 1; 2866 } else if (ipv6_addr_type(&msg->target) != 2867 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 2868 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local 
unicast\n"); 2869 return; 2870 } 2871 2872 in6_dev = __in6_dev_get(skb->dev); 2873 if (!in6_dev) 2874 return; 2875 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 2876 return; 2877 2878 /* RFC2461 8.1: 2879 * The IP source address of the Redirect MUST be the same as the current 2880 * first-hop router for the specified ICMP Destination Address. 2881 */ 2882 2883 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 2884 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 2885 return; 2886 } 2887 2888 lladdr = NULL; 2889 if (ndopts.nd_opts_tgt_lladdr) { 2890 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 2891 skb->dev); 2892 if (!lladdr) { 2893 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 2894 return; 2895 } 2896 } 2897 2898 rt = (struct rt6_info *) dst; 2899 if (rt->rt6i_flags & RTF_REJECT) { 2900 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 2901 return; 2902 } 2903 2904 /* Redirect received -> path was valid. 2905 * Look, redirects are sent only in response to data packets, 2906 * so that this nexthop apparently is reachable. --ANK 2907 */ 2908 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 2909 2910 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2911 if (!neigh) 2912 return; 2913 2914 /* 2915 * We have finally decided to accept it. 2916 */ 2917 2918 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 2919 NEIGH_UPDATE_F_WEAK_OVERRIDE| 2920 NEIGH_UPDATE_F_OVERRIDE| 2921 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 2922 NEIGH_UPDATE_F_ISROUTER)), 2923 NDISC_REDIRECT, &ndopts); 2924 2925 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 2926 if (!nrt) 2927 goto out; 2928 2929 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 2930 if (on_link) 2931 nrt->rt6i_flags &= ~RTF_GATEWAY; 2932 2933 nrt->rt6i_protocol = RTPROT_REDIRECT; 2934 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 2935 2936 if (ip6_ins_rt(nrt)) 2937 goto out_release; 2938 2939 netevent.old = &rt->dst; 2940 netevent.new = &nrt->dst; 2941 netevent.daddr = &msg->dest; 2942 netevent.neigh = neigh; 2943 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 2944 2945 if (rt->rt6i_flags & RTF_CACHE) { 2946 rt = (struct rt6_info *) dst_clone(&rt->dst); 2947 ip6_del_rt(rt); 2948 } 2949 2950 out_release: 2951 /* Release the reference taken in 2952 * ip6_rt_cache_alloc() 2953 */ 2954 dst_release(&nrt->dst); 2955 2956 out: 2957 neigh_release(neigh); 2958 } 2959 2960 /* 2961 * Misc support functions 2962 */ 2963 2964 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 2965 { 2966 BUG_ON(from->dst.from); 2967 2968 rt->rt6i_flags &= ~RTF_EXPIRES; 2969 dst_hold(&from->dst); 2970 rt->dst.from = &from->dst; 2971 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 2972 } 2973 2974 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 2975 { 2976 rt->dst.input = ort->dst.input; 2977 rt->dst.output = ort->dst.output; 2978 rt->rt6i_dst = ort->rt6i_dst; 2979 rt->dst.error = ort->dst.error; 2980 rt->rt6i_idev = ort->rt6i_idev; 2981 if (rt->rt6i_idev) 2982 in6_dev_hold(rt->rt6i_idev); 2983 rt->dst.lastuse = jiffies; 2984 rt->rt6i_gateway = ort->rt6i_gateway; 2985 rt->rt6i_flags = ort->rt6i_flags; 2986 rt6_set_from(rt, ort); 2987 rt->rt6i_metric = ort->rt6i_metric; 2988 #ifdef CONFIG_IPV6_SUBTREES 2989 rt->rt6i_src = ort->rt6i_src; 2990 #endif 2991 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 2992 rt->rt6i_table = ort->rt6i_table; 2993 rt->dst.lwtstate = 
lwtstate_get(ort->dst.lwtstate); 2994 } 2995 2996 #ifdef CONFIG_IPV6_ROUTE_INFO 2997 static struct rt6_info *rt6_get_route_info(struct net *net, 2998 const struct in6_addr *prefix, int prefixlen, 2999 const struct in6_addr *gwaddr, 3000 struct net_device *dev) 3001 { 3002 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 3003 int ifindex = dev->ifindex; 3004 struct fib6_node *fn; 3005 struct rt6_info *rt = NULL; 3006 struct fib6_table *table; 3007 3008 table = fib6_get_table(net, tb_id); 3009 if (!table) 3010 return NULL; 3011 3012 read_lock_bh(&table->tb6_lock); 3013 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); 3014 if (!fn) 3015 goto out; 3016 3017 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 3018 if (rt->dst.dev->ifindex != ifindex) 3019 continue; 3020 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 3021 continue; 3022 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 3023 continue; 3024 dst_hold(&rt->dst); 3025 break; 3026 } 3027 out: 3028 read_unlock_bh(&table->tb6_lock); 3029 return rt; 3030 } 3031 3032 static struct rt6_info *rt6_add_route_info(struct net *net, 3033 const struct in6_addr *prefix, int prefixlen, 3034 const struct in6_addr *gwaddr, 3035 struct net_device *dev, 3036 unsigned int pref) 3037 { 3038 struct fib6_config cfg = { 3039 .fc_metric = IP6_RT_PRIO_USER, 3040 .fc_ifindex = dev->ifindex, 3041 .fc_dst_len = prefixlen, 3042 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 3043 RTF_UP | RTF_PREF(pref), 3044 .fc_protocol = RTPROT_RA, 3045 .fc_nlinfo.portid = 0, 3046 .fc_nlinfo.nlh = NULL, 3047 .fc_nlinfo.nl_net = net, 3048 }; 3049 3050 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 3051 cfg.fc_dst = *prefix; 3052 cfg.fc_gateway = *gwaddr; 3053 3054 /* We should treat it as a default route if prefix length is 0. */ 3055 if (!prefixlen) 3056 cfg.fc_flags |= RTF_DEFAULT; 3057 3058 ip6_route_add(&cfg, NULL); 3059 3060 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 3061 } 3062 #endif 3063 3064 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 3065 { 3066 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 3067 struct rt6_info *rt; 3068 struct fib6_table *table; 3069 3070 table = fib6_get_table(dev_net(dev), tb_id); 3071 if (!table) 3072 return NULL; 3073 3074 read_lock_bh(&table->tb6_lock); 3075 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 3076 if (dev == rt->dst.dev && 3077 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 3078 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 3079 break; 3080 } 3081 if (rt) 3082 dst_hold(&rt->dst); 3083 read_unlock_bh(&table->tb6_lock); 3084 return rt; 3085 } 3086 3087 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 3088 struct net_device *dev, 3089 unsigned int pref) 3090 { 3091 struct fib6_config cfg = { 3092 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 3093 .fc_metric = IP6_RT_PRIO_USER, 3094 .fc_ifindex = dev->ifindex, 3095 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 3096 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 3097 .fc_protocol = RTPROT_RA, 3098 .fc_nlinfo.portid = 0, 3099 .fc_nlinfo.nlh = NULL, 3100 .fc_nlinfo.nl_net = dev_net(dev), 3101 }; 3102 3103 cfg.fc_gateway = *gwaddr; 3104 3105 if (!ip6_route_add(&cfg, NULL)) { 3106 struct fib6_table *table; 3107 3108 table = fib6_get_table(dev_net(dev), cfg.fc_table); 3109 if (table) 3110 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 3111 } 3112 3113 return rt6_get_dflt_router(gwaddr, dev); 3114 } 3115 3116 static void __rt6_purge_dflt_routers(struct fib6_table *table) 3117 { 3118 struct rt6_info *rt; 3119 3120 restart: 3121 read_lock_bh(&table->tb6_lock); 3122 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 3123 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 3124 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 3125 dst_hold(&rt->dst); 3126 read_unlock_bh(&table->tb6_lock); 3127 ip6_del_rt(rt); 3128 goto restart; 3129 } 3130 } 3131 read_unlock_bh(&table->tb6_lock); 3132 3133 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 3134 } 3135 3136 void rt6_purge_dflt_routers(struct net *net) 3137 { 3138 struct fib6_table *table; 3139 struct hlist_head *head; 3140 unsigned int h; 3141 3142 rcu_read_lock(); 3143 3144 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 3145 head = &net->ipv6.fib_table_hash[h]; 3146 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 3147 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 3148 __rt6_purge_dflt_routers(table); 3149 } 3150 } 3151 3152 rcu_read_unlock(); 3153 } 3154 3155 static void rtmsg_to_fib6_config(struct net *net, 3156 struct in6_rtmsg *rtmsg, 3157 struct fib6_config *cfg) 3158 { 3159 memset(cfg, 0, sizeof(*cfg)); 3160 3161 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
3162 : RT6_TABLE_MAIN; 3163 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 3164 cfg->fc_metric = rtmsg->rtmsg_metric; 3165 cfg->fc_expires = rtmsg->rtmsg_info; 3166 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 3167 cfg->fc_src_len = rtmsg->rtmsg_src_len; 3168 cfg->fc_flags = rtmsg->rtmsg_flags; 3169 3170 cfg->fc_nlinfo.nl_net = net; 3171 3172 cfg->fc_dst = rtmsg->rtmsg_dst; 3173 cfg->fc_src = rtmsg->rtmsg_src; 3174 cfg->fc_gateway = rtmsg->rtmsg_gateway; 3175 } 3176 3177 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 3178 { 3179 struct fib6_config cfg; 3180 struct in6_rtmsg rtmsg; 3181 int err; 3182 3183 switch (cmd) { 3184 case SIOCADDRT: /* Add a route */ 3185 case SIOCDELRT: /* Delete a route */ 3186 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 3187 return -EPERM; 3188 err = copy_from_user(&rtmsg, arg, 3189 sizeof(struct in6_rtmsg)); 3190 if (err) 3191 return -EFAULT; 3192 3193 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 3194 3195 rtnl_lock(); 3196 switch (cmd) { 3197 case SIOCADDRT: 3198 err = ip6_route_add(&cfg, NULL); 3199 break; 3200 case SIOCDELRT: 3201 err = ip6_route_del(&cfg, NULL); 3202 break; 3203 default: 3204 err = -EINVAL; 3205 } 3206 rtnl_unlock(); 3207 3208 return err; 3209 } 3210 3211 return -EINVAL; 3212 } 3213 3214 /* 3215 * Drop the packet on the floor 3216 */ 3217 3218 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 3219 { 3220 int type; 3221 struct dst_entry *dst = skb_dst(skb); 3222 switch (ipstats_mib_noroutes) { 3223 case IPSTATS_MIB_INNOROUTES: 3224 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 3225 if (type == IPV6_ADDR_ANY) { 3226 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3227 IPSTATS_MIB_INADDRERRORS); 3228 break; 3229 } 3230 /* FALLTHROUGH */ 3231 case IPSTATS_MIB_OUTNOROUTES: 3232 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 3233 ipstats_mib_noroutes); 3234 break; 3235 } 3236 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 3237 kfree_skb(skb); 3238 return 0; 3239 } 3240 3241 static int ip6_pkt_discard(struct sk_buff *skb) 3242 { 3243 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 3244 } 3245 3246 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3247 { 3248 skb->dev = skb_dst(skb)->dev; 3249 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 3250 } 3251 3252 static int ip6_pkt_prohibit(struct sk_buff *skb) 3253 { 3254 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 3255 } 3256 3257 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 3258 { 3259 skb->dev = skb_dst(skb)->dev; 3260 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 3261 } 3262 3263 /* 3264 * Allocate a dst for local (unicast / anycast) address. 
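These are host routes (plen 128) placed in the local table (or the l3mdev table, if one applies); an anycast address is flagged RTF_ANYCAST, a unicast one RTF_LOCAL.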
3265 */ 3266 3267 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 3268 const struct in6_addr *addr, 3269 bool anycast) 3270 { 3271 u32 tb_id; 3272 struct net *net = dev_net(idev->dev); 3273 struct net_device *dev = idev->dev; 3274 struct rt6_info *rt; 3275 3276 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 3277 if (!rt) 3278 return ERR_PTR(-ENOMEM); 3279 3280 in6_dev_hold(idev); 3281 3282 rt->dst.flags |= DST_HOST; 3283 rt->dst.input = ip6_input; 3284 rt->dst.output = ip6_output; 3285 rt->rt6i_idev = idev; 3286 3287 rt->rt6i_protocol = RTPROT_KERNEL; 3288 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 3289 if (anycast) 3290 rt->rt6i_flags |= RTF_ANYCAST; 3291 else 3292 rt->rt6i_flags |= RTF_LOCAL; 3293 3294 rt->rt6i_gateway = *addr; 3295 rt->rt6i_dst.addr = *addr; 3296 rt->rt6i_dst.plen = 128; 3297 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 3298 rt->rt6i_table = fib6_get_table(net, tb_id); 3299 3300 return rt; 3301 } 3302 3303 /* remove deleted ip from prefsrc entries */ 3304 struct arg_dev_net_ip { 3305 struct net_device *dev; 3306 struct net *net; 3307 struct in6_addr *addr; 3308 }; 3309 3310 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 3311 { 3312 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 3313 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 3314 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 3315 3316 if (((void *)rt->dst.dev == dev || !dev) && 3317 rt != net->ipv6.ip6_null_entry && 3318 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 3319 spin_lock_bh(&rt6_exception_lock); 3320 /* remove prefsrc entry */ 3321 rt->rt6i_prefsrc.plen = 0; 3322 /* need to update cache as well */ 3323 rt6_exceptions_remove_prefsrc(rt); 3324 spin_unlock_bh(&rt6_exception_lock); 3325 } 3326 return 0; 3327 } 3328 3329 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 3330 { 3331 struct net *net = dev_net(ifp->idev->dev); 3332 struct arg_dev_net_ip adni = { 3333 .dev = ifp->idev->dev, 3334 .net = net, 3335 .addr = &ifp->addr, 3336 }; 3337 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 3338 } 3339 3340 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 3341 3342 /* Remove routers and update dst entries when a gateway turns into a host. */ 3343 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 3344 { 3345 struct in6_addr *gateway = (struct in6_addr *)arg; 3346 3347 /* RTF_CACHE_GATEWAY case will be removed once the exception 3348 * table is hooked up to store all cached routes. 3349 */ 3350 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || 3351 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && 3352 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 3353 return -1; 3354 } 3355 3356 /* Further clean up cached routes in exception table. 3357 * This is needed because a cached route may have a different 3358 * gateway than its 'parent' in the case of an ip redirect.
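Removing only the parent route would leave such stale exception entries behind.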
3359 */ 3360 rt6_exceptions_clean_tohost(rt, gateway); 3361 3362 return 0; 3363 } 3364 3365 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 3366 { 3367 fib6_clean_all(net, fib6_clean_tohost, gateway); 3368 } 3369 3370 struct arg_dev_net { 3371 struct net_device *dev; 3372 struct net *net; 3373 }; 3374 3375 /* called with write lock held for table with rt */ 3376 static int fib6_ifdown(struct rt6_info *rt, void *arg) 3377 { 3378 const struct arg_dev_net *adn = arg; 3379 const struct net_device *dev = adn->dev; 3380 3381 if ((rt->dst.dev == dev || !dev) && 3382 rt != adn->net->ipv6.ip6_null_entry && 3383 (rt->rt6i_nsiblings == 0 || 3384 (dev && netdev_unregistering(dev)) || 3385 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 3386 return -1; 3387 3388 return 0; 3389 } 3390 3391 void rt6_ifdown(struct net *net, struct net_device *dev) 3392 { 3393 struct arg_dev_net adn = { 3394 .dev = dev, 3395 .net = net, 3396 }; 3397 3398 fib6_clean_all(net, fib6_ifdown, &adn); 3399 if (dev) 3400 rt6_uncached_list_flush_dev(net, dev); 3401 } 3402 3403 struct rt6_mtu_change_arg { 3404 struct net_device *dev; 3405 unsigned int mtu; 3406 }; 3407 3408 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 3409 { 3410 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 3411 struct inet6_dev *idev; 3412 3413 /* In IPv6, PMTU discovery is not optional, 3414 so the RTAX_MTU lock cannot disable it. 3415 We still use this lock to block changes 3416 caused by addrconf/ndisc. 3417 */ 3418 3419 idev = __in6_dev_get(arg->dev); 3420 if (!idev) 3421 return 0; 3422 3423 /* An administrative MTU increase cannot be discovered by 3424 IPv6 PMTU discovery, so such an increase must be applied here. 3425 Since RFC 1981 doesn't cover administrative MTU increases, 3426 updating the PMTU on an increase is a MUST (e.g. jumbo frames). 3427 */ 3428 /* 3429 If the new MTU is less than the route PMTU, the new MTU will be 3430 the lowest MTU in the path; update the route PMTU to reflect the 3431 decrease. If the new MTU is greater than the route PMTU, and the 3432 old MTU was the lowest MTU in the path, update the route PMTU 3433 to reflect the increase. In that case, if another node on the path 3434 still has a lower MTU, its Packet Too Big message will trigger 3435 PMTU discovery again. 3436 */ 3437 if (rt->dst.dev == arg->dev && 3438 dst_metric_raw(&rt->dst, RTAX_MTU) && 3439 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 3440 spin_lock_bh(&rt6_exception_lock); 3441 /* This case will be removed once the exception table 3442 * is hooked up. 3443 */ 3444 if (rt->rt6i_flags & RTF_CACHE) { 3445 /* For RTF_CACHE with rt6i_pmtu == 0 3446 * (i.e. a redirected route), 3447 * the metrics of its rt->dst.from have already 3448 * been updated.
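Only an explicit rt6i_pmtu larger than the new device MTU still needs to be lowered here.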
3449 */ 3450 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) 3451 rt->rt6i_pmtu = arg->mtu; 3452 } else if (dst_mtu(&rt->dst) >= arg->mtu || 3453 (dst_mtu(&rt->dst) < arg->mtu && 3454 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 3455 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 3456 } 3457 rt6_exceptions_update_pmtu(rt, arg->mtu); 3458 spin_unlock_bh(&rt6_exception_lock); 3459 } 3460 return 0; 3461 } 3462 3463 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 3464 { 3465 struct rt6_mtu_change_arg arg = { 3466 .dev = dev, 3467 .mtu = mtu, 3468 }; 3469 3470 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 3471 } 3472 3473 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 3474 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 3475 [RTA_OIF] = { .type = NLA_U32 }, 3476 [RTA_IIF] = { .type = NLA_U32 }, 3477 [RTA_PRIORITY] = { .type = NLA_U32 }, 3478 [RTA_METRICS] = { .type = NLA_NESTED }, 3479 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 3480 [RTA_PREF] = { .type = NLA_U8 }, 3481 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 3482 [RTA_ENCAP] = { .type = NLA_NESTED }, 3483 [RTA_EXPIRES] = { .type = NLA_U32 }, 3484 [RTA_UID] = { .type = NLA_U32 }, 3485 [RTA_MARK] = { .type = NLA_U32 }, 3486 }; 3487 3488 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 3489 struct fib6_config *cfg, 3490 struct netlink_ext_ack *extack) 3491 { 3492 struct rtmsg *rtm; 3493 struct nlattr *tb[RTA_MAX+1]; 3494 unsigned int pref; 3495 int err; 3496 3497 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3498 NULL); 3499 if (err < 0) 3500 goto errout; 3501 3502 err = -EINVAL; 3503 rtm = nlmsg_data(nlh); 3504 memset(cfg, 0, sizeof(*cfg)); 3505 3506 cfg->fc_table = rtm->rtm_table; 3507 cfg->fc_dst_len = rtm->rtm_dst_len; 3508 cfg->fc_src_len = rtm->rtm_src_len; 3509 cfg->fc_flags = RTF_UP; 3510 cfg->fc_protocol = rtm->rtm_protocol; 3511 cfg->fc_type = rtm->rtm_type; 3512 3513 if (rtm->rtm_type == RTN_UNREACHABLE || 3514 rtm->rtm_type == RTN_BLACKHOLE || 3515 rtm->rtm_type == RTN_PROHIBIT || 3516 rtm->rtm_type == RTN_THROW) 3517 cfg->fc_flags |= RTF_REJECT; 3518 3519 if (rtm->rtm_type == RTN_LOCAL) 3520 cfg->fc_flags |= RTF_LOCAL; 3521 3522 if (rtm->rtm_flags & RTM_F_CLONED) 3523 cfg->fc_flags |= RTF_CACHE; 3524 3525 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 3526 cfg->fc_nlinfo.nlh = nlh; 3527 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3528 3529 if (tb[RTA_GATEWAY]) { 3530 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3531 cfg->fc_flags |= RTF_GATEWAY; 3532 } 3533 3534 if (tb[RTA_DST]) { 3535 int plen = (rtm->rtm_dst_len + 7) >> 3; 3536 3537 if (nla_len(tb[RTA_DST]) < plen) 3538 goto errout; 3539 3540 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3541 } 3542 3543 if (tb[RTA_SRC]) { 3544 int plen = (rtm->rtm_src_len + 7) >> 3; 3545 3546 if (nla_len(tb[RTA_SRC]) < plen) 3547 goto errout; 3548 3549 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3550 } 3551 3552 if (tb[RTA_PREFSRC]) 3553 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3554 3555 if (tb[RTA_OIF]) 3556 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3557 3558 if (tb[RTA_PRIORITY]) 3559 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3560 3561 if (tb[RTA_METRICS]) { 3562 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3563 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3564 } 3565 3566 if (tb[RTA_TABLE]) 3567 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3568 3569 if (tb[RTA_MULTIPATH]) { 3570 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 3571 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3572 3573 
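/* reject unknown or invalid encap types embedded in the nexthops before the multipath add/del paths consume the config */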
err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3574 cfg->fc_mp_len, extack); 3575 if (err < 0) 3576 goto errout; 3577 } 3578 3579 if (tb[RTA_PREF]) { 3580 pref = nla_get_u8(tb[RTA_PREF]); 3581 if (pref != ICMPV6_ROUTER_PREF_LOW && 3582 pref != ICMPV6_ROUTER_PREF_HIGH) 3583 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3584 cfg->fc_flags |= RTF_PREF(pref); 3585 } 3586 3587 if (tb[RTA_ENCAP]) 3588 cfg->fc_encap = tb[RTA_ENCAP]; 3589 3590 if (tb[RTA_ENCAP_TYPE]) { 3591 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3592 3593 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3594 if (err < 0) 3595 goto errout; 3596 } 3597 3598 if (tb[RTA_EXPIRES]) { 3599 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3600 3601 if (addrconf_finite_timeout(timeout)) { 3602 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3603 cfg->fc_flags |= RTF_EXPIRES; 3604 } 3605 } 3606 3607 err = 0; 3608 errout: 3609 return err; 3610 } 3611 3612 struct rt6_nh { 3613 struct rt6_info *rt6_info; 3614 struct fib6_config r_cfg; 3615 struct mx6_config mxc; 3616 struct list_head next; 3617 }; 3618 3619 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3620 { 3621 struct rt6_nh *nh; 3622 3623 list_for_each_entry(nh, rt6_nh_list, next) { 3624 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3625 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3626 nh->r_cfg.fc_ifindex); 3627 } 3628 } 3629 3630 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3631 struct rt6_info *rt, struct fib6_config *r_cfg) 3632 { 3633 struct rt6_nh *nh; 3634 int err = -EEXIST; 3635 3636 list_for_each_entry(nh, rt6_nh_list, next) { 3637 /* check if rt6_info already exists */ 3638 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 3639 return err; 3640 } 3641 3642 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 3643 if (!nh) 3644 return -ENOMEM; 3645 nh->rt6_info = rt; 3646 err = ip6_convert_metrics(&nh->mxc, r_cfg); 3647 if (err) { 3648 kfree(nh); 3649 return err; 3650 } 3651 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 3652 list_add_tail(&nh->next, rt6_nh_list); 3653 3654 return 0; 3655 } 3656 3657 static void ip6_route_mpath_notify(struct rt6_info *rt, 3658 struct rt6_info *rt_last, 3659 struct nl_info *info, 3660 __u16 nlflags) 3661 { 3662 /* if this is an APPEND route, then rt points to the first route 3663 * inserted and rt_last points to last route inserted. Userspace 3664 * wants a consistent dump of the route which starts at the first 3665 * nexthop. Since sibling routes are always added at the end of 3666 * the list, find the first sibling of the last route appended 3667 */ 3668 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 3669 rt = list_first_entry(&rt_last->rt6i_siblings, 3670 struct rt6_info, 3671 rt6i_siblings); 3672 } 3673 3674 if (rt) 3675 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 3676 } 3677 3678 static int ip6_route_multipath_add(struct fib6_config *cfg, 3679 struct netlink_ext_ack *extack) 3680 { 3681 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 3682 struct nl_info *info = &cfg->fc_nlinfo; 3683 struct fib6_config r_cfg; 3684 struct rtnexthop *rtnh; 3685 struct rt6_info *rt; 3686 struct rt6_nh *err_nh; 3687 struct rt6_nh *nh, *nh_safe; 3688 __u16 nlflags; 3689 int remaining; 3690 int attrlen; 3691 int err = 1; 3692 int nhn = 0; 3693 int replace = (cfg->fc_nlinfo.nlh && 3694 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3695 LIST_HEAD(rt6_nh_list); 3696 3697 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 3698 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 3699 nlflags |= NLM_F_APPEND; 3700 3701 remaining = cfg->fc_mp_len; 3702 rtnh = (struct rtnexthop *)cfg->fc_mp; 3703 3704 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 3705 * rt6_info structs per nexthop 3706 */ 3707 while (rtnh_ok(rtnh, remaining)) { 3708 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3709 if (rtnh->rtnh_ifindex) 3710 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3711 3712 attrlen = rtnh_attrlen(rtnh); 3713 if (attrlen > 0) { 3714 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3715 3716 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3717 if (nla) { 3718 r_cfg.fc_gateway = nla_get_in6_addr(nla); 3719 r_cfg.fc_flags |= RTF_GATEWAY; 3720 } 3721 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 3722 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 3723 if (nla) 3724 r_cfg.fc_encap_type = nla_get_u16(nla); 3725 } 3726 3727 rt = ip6_route_info_create(&r_cfg, extack); 3728 if (IS_ERR(rt)) { 3729 err = PTR_ERR(rt); 3730 rt = NULL; 3731 goto cleanup; 3732 } 3733 3734 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3735 if (err) { 3736 dst_release_immediate(&rt->dst); 3737 goto cleanup; 3738 } 3739 3740 rtnh = rtnh_next(rtnh, &remaining); 3741 } 3742 3743 /* for add and replace send one notification with all nexthops. 3744 * Skip the notification in fib6_add_rt2node and send one with 3745 * the full route when done 3746 */ 3747 info->skip_notify = 1; 3748 3749 err_nh = NULL; 3750 list_for_each_entry(nh, &rt6_nh_list, next) { 3751 rt_last = nh->rt6_info; 3752 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 3753 /* save reference to first route for notification */ 3754 if (!rt_notif && !err) 3755 rt_notif = nh->rt6_info; 3756 3757 /* nh->rt6_info is used or freed at this point, reset to NULL */ 3758 nh->rt6_info = NULL; 3759 if (err) { 3760 if (replace && nhn) 3761 ip6_print_replace_route_err(&rt6_nh_list); 3762 err_nh = nh; 3763 goto add_errout; 3764 } 3765 3766 /* Because each route is added as if it were a single route, we 3767 * remove these flags after the first nexthop: if there is a 3768 * collision, we have already failed to add the first nexthop, 3769 * since fib6_add_rt2node() has rejected it; when replacing, the 3770 * old nexthops have been replaced by the first new one, so the 3771 * rest should be appended to it. 3772 */ 3773 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 3774 NLM_F_REPLACE); 3775 nhn++; 3776 } 3777 3778 /* success ...
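all nexthops were installed;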
tell user about new route */ 3779 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3780 goto cleanup; 3781 3782 add_errout: 3783 /* send notification for routes that were added so that 3784 * the delete notifications sent by ip6_route_del are 3785 * coherent 3786 */ 3787 if (rt_notif) 3788 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3789 3790 /* Delete routes that were already added */ 3791 list_for_each_entry(nh, &rt6_nh_list, next) { 3792 if (err_nh == nh) 3793 break; 3794 ip6_route_del(&nh->r_cfg, extack); 3795 } 3796 3797 cleanup: 3798 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3799 if (nh->rt6_info) 3800 dst_release_immediate(&nh->rt6_info->dst); 3801 kfree(nh->mxc.mx); 3802 list_del(&nh->next); 3803 kfree(nh); 3804 } 3805 3806 return err; 3807 } 3808 3809 static int ip6_route_multipath_del(struct fib6_config *cfg, 3810 struct netlink_ext_ack *extack) 3811 { 3812 struct fib6_config r_cfg; 3813 struct rtnexthop *rtnh; 3814 int remaining; 3815 int attrlen; 3816 int err = 1, last_err = 0; 3817 3818 remaining = cfg->fc_mp_len; 3819 rtnh = (struct rtnexthop *)cfg->fc_mp; 3820 3821 /* Parse a Multipath Entry */ 3822 while (rtnh_ok(rtnh, remaining)) { 3823 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3824 if (rtnh->rtnh_ifindex) 3825 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3826 3827 attrlen = rtnh_attrlen(rtnh); 3828 if (attrlen > 0) { 3829 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3830 3831 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3832 if (nla) { 3833 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 3834 r_cfg.fc_flags |= RTF_GATEWAY; 3835 } 3836 } 3837 err = ip6_route_del(&r_cfg, extack); 3838 if (err) 3839 last_err = err; 3840 3841 rtnh = rtnh_next(rtnh, &remaining); 3842 } 3843 3844 return last_err; 3845 } 3846 3847 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3848 struct netlink_ext_ack *extack) 3849 { 3850 struct fib6_config cfg; 3851 int err; 3852 3853 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3854 if (err < 0) 3855 return err; 3856 3857 if (cfg.fc_mp) 3858 return ip6_route_multipath_del(&cfg, extack); 3859 else { 3860 cfg.fc_delete_all_nh = 1; 3861 return ip6_route_del(&cfg, extack); 3862 } 3863 } 3864 3865 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3866 struct netlink_ext_ack *extack) 3867 { 3868 struct fib6_config cfg; 3869 int err; 3870 3871 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3872 if (err < 0) 3873 return err; 3874 3875 if (cfg.fc_mp) 3876 return ip6_route_multipath_add(&cfg, extack); 3877 else 3878 return ip6_route_add(&cfg, extack); 3879 } 3880 3881 static size_t rt6_nlmsg_size(struct rt6_info *rt) 3882 { 3883 int nexthop_len = 0; 3884 3885 if (rt->rt6i_nsiblings) { 3886 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 3887 + NLA_ALIGN(sizeof(struct rtnexthop)) 3888 + nla_total_size(16) /* RTA_GATEWAY */ 3889 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3890 3891 nexthop_len *= rt->rt6i_nsiblings; 3892 } 3893 3894 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3895 + nla_total_size(16) /* RTA_SRC */ 3896 + nla_total_size(16) /* RTA_DST */ 3897 + nla_total_size(16) /* RTA_GATEWAY */ 3898 + nla_total_size(16) /* RTA_PREFSRC */ 3899 + nla_total_size(4) /* RTA_TABLE */ 3900 + nla_total_size(4) /* RTA_IIF */ 3901 + nla_total_size(4) /* RTA_OIF */ 3902 + nla_total_size(4) /* RTA_PRIORITY */ 3903 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 3904 + nla_total_size(sizeof(struct rta_cacheinfo)) 3905 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3906 + nla_total_size(1) /* 
RTA_PREF */ 3907 + lwtunnel_get_encap_size(rt->dst.lwtstate) 3908 + nexthop_len; 3909 } 3910 3911 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 3912 unsigned int *flags, bool skip_oif) 3913 { 3914 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { 3915 *flags |= RTNH_F_LINKDOWN; 3916 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 3917 *flags |= RTNH_F_DEAD; 3918 } 3919 3920 if (rt->rt6i_flags & RTF_GATEWAY) { 3921 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 3922 goto nla_put_failure; 3923 } 3924 3925 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 3926 *flags |= RTNH_F_OFFLOAD; 3927 3928 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 3929 if (!skip_oif && rt->dst.dev && 3930 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 3931 goto nla_put_failure; 3932 3933 if (rt->dst.lwtstate && 3934 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 3935 goto nla_put_failure; 3936 3937 return 0; 3938 3939 nla_put_failure: 3940 return -EMSGSIZE; 3941 } 3942 3943 /* add multipath next hop */ 3944 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 3945 { 3946 struct rtnexthop *rtnh; 3947 unsigned int flags = 0; 3948 3949 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 3950 if (!rtnh) 3951 goto nla_put_failure; 3952 3953 rtnh->rtnh_hops = 0; 3954 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 3955 3956 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 3957 goto nla_put_failure; 3958 3959 rtnh->rtnh_flags = flags; 3960 3961 /* length of rtnetlink header + attributes */ 3962 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 3963 3964 return 0; 3965 3966 nla_put_failure: 3967 return -EMSGSIZE; 3968 } 3969 3970 static int rt6_fill_node(struct net *net, 3971 struct sk_buff *skb, struct rt6_info *rt, 3972 struct in6_addr *dst, struct in6_addr *src, 3973 int iif, int type, u32 portid, u32 seq, 3974 unsigned int flags) 3975 { 3976 u32 metrics[RTAX_MAX]; 3977 struct rtmsg *rtm; 3978 struct nlmsghdr *nlh; 3979 long expires; 3980 u32 table; 3981 3982 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 3983 if (!nlh) 3984 return -EMSGSIZE; 3985 3986 rtm = nlmsg_data(nlh); 3987 rtm->rtm_family = AF_INET6; 3988 rtm->rtm_dst_len = rt->rt6i_dst.plen; 3989 rtm->rtm_src_len = rt->rt6i_src.plen; 3990 rtm->rtm_tos = 0; 3991 if (rt->rt6i_table) 3992 table = rt->rt6i_table->tb6_id; 3993 else 3994 table = RT6_TABLE_UNSPEC; 3995 rtm->rtm_table = table; 3996 if (nla_put_u32(skb, RTA_TABLE, table)) 3997 goto nla_put_failure; 3998 if (rt->rt6i_flags & RTF_REJECT) { 3999 switch (rt->dst.error) { 4000 case -EINVAL: 4001 rtm->rtm_type = RTN_BLACKHOLE; 4002 break; 4003 case -EACCES: 4004 rtm->rtm_type = RTN_PROHIBIT; 4005 break; 4006 case -EAGAIN: 4007 rtm->rtm_type = RTN_THROW; 4008 break; 4009 default: 4010 rtm->rtm_type = RTN_UNREACHABLE; 4011 break; 4012 } 4013 } 4014 else if (rt->rt6i_flags & RTF_LOCAL) 4015 rtm->rtm_type = RTN_LOCAL; 4016 else if (rt->rt6i_flags & RTF_ANYCAST) 4017 rtm->rtm_type = RTN_ANYCAST; 4018 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 4019 rtm->rtm_type = RTN_LOCAL; 4020 else 4021 rtm->rtm_type = RTN_UNICAST; 4022 rtm->rtm_flags = 0; 4023 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 4024 rtm->rtm_protocol = rt->rt6i_protocol; 4025 4026 if (rt->rt6i_flags & RTF_CACHE) 4027 rtm->rtm_flags |= RTM_F_CLONED; 4028 4029 if (dst) { 4030 if (nla_put_in6_addr(skb, RTA_DST, dst)) 4031 goto nla_put_failure; 4032 rtm->rtm_dst_len = 128; 4033 } else if (rtm->rtm_dst_len) 4034 if 
(nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 4035 goto nla_put_failure; 4036 #ifdef CONFIG_IPV6_SUBTREES 4037 if (src) { 4038 if (nla_put_in6_addr(skb, RTA_SRC, src)) 4039 goto nla_put_failure; 4040 rtm->rtm_src_len = 128; 4041 } else if (rtm->rtm_src_len && 4042 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 4043 goto nla_put_failure; 4044 #endif 4045 if (iif) { 4046 #ifdef CONFIG_IPV6_MROUTE 4047 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 4048 int err = ip6mr_get_route(net, skb, rtm, portid); 4049 4050 if (err == 0) 4051 return 0; 4052 if (err < 0) 4053 goto nla_put_failure; 4054 } else 4055 #endif 4056 if (nla_put_u32(skb, RTA_IIF, iif)) 4057 goto nla_put_failure; 4058 } else if (dst) { 4059 struct in6_addr saddr_buf; 4060 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 4061 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4062 goto nla_put_failure; 4063 } 4064 4065 if (rt->rt6i_prefsrc.plen) { 4066 struct in6_addr saddr_buf; 4067 saddr_buf = rt->rt6i_prefsrc.addr; 4068 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 4069 goto nla_put_failure; 4070 } 4071 4072 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 4073 if (rt->rt6i_pmtu) 4074 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 4075 if (rtnetlink_put_metrics(skb, metrics) < 0) 4076 goto nla_put_failure; 4077 4078 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 4079 goto nla_put_failure; 4080 4081 /* For multipath routes, walk the siblings list and add 4082 * each as a nexthop within RTA_MULTIPATH. 4083 */ 4084 if (rt->rt6i_nsiblings) { 4085 struct rt6_info *sibling, *next_sibling; 4086 struct nlattr *mp; 4087 4088 mp = nla_nest_start(skb, RTA_MULTIPATH); 4089 if (!mp) 4090 goto nla_put_failure; 4091 4092 if (rt6_add_nexthop(skb, rt) < 0) 4093 goto nla_put_failure; 4094 4095 list_for_each_entry_safe(sibling, next_sibling, 4096 &rt->rt6i_siblings, rt6i_siblings) { 4097 if (rt6_add_nexthop(skb, sibling) < 0) 4098 goto nla_put_failure; 4099 } 4100 4101 nla_nest_end(skb, mp); 4102 } else { 4103 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 4104 goto nla_put_failure; 4105 } 4106 4107 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
rt->dst.expires - jiffies : 0; 4108 4109 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 4110 goto nla_put_failure; 4111 4112 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 4113 goto nla_put_failure; 4114 4115 4116 nlmsg_end(skb, nlh); 4117 return 0; 4118 4119 nla_put_failure: 4120 nlmsg_cancel(skb, nlh); 4121 return -EMSGSIZE; 4122 } 4123 4124 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 4125 { 4126 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 4127 struct net *net = arg->net; 4128 4129 if (rt == net->ipv6.ip6_null_entry) 4130 return 0; 4131 4132 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 4133 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 4134 4135 /* user wants prefix routes only */ 4136 if (rtm->rtm_flags & RTM_F_PREFIX && 4137 !(rt->rt6i_flags & RTF_PREFIX_RT)) { 4138 /* success since this is not a prefix route */ 4139 return 1; 4140 } 4141 } 4142 4143 return rt6_fill_node(net, 4144 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 4145 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 4146 NLM_F_MULTI); 4147 } 4148 4149 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 4150 struct netlink_ext_ack *extack) 4151 { 4152 struct net *net = sock_net(in_skb->sk); 4153 struct nlattr *tb[RTA_MAX+1]; 4154 int err, iif = 0, oif = 0; 4155 struct dst_entry *dst; 4156 struct rt6_info *rt; 4157 struct sk_buff *skb; 4158 struct rtmsg *rtm; 4159 struct flowi6 fl6; 4160 bool fibmatch; 4161 4162 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 4163 extack); 4164 if (err < 0) 4165 goto errout; 4166 4167 err = -EINVAL; 4168 memset(&fl6, 0, sizeof(fl6)); 4169 rtm = nlmsg_data(nlh); 4170 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 4171 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 4172 4173 if (tb[RTA_SRC]) { 4174 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 4175 goto errout; 4176 4177 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 4178 } 4179 4180 if (tb[RTA_DST]) { 4181 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 4182 goto errout; 4183 4184 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 4185 } 4186 4187 if (tb[RTA_IIF]) 4188 iif = nla_get_u32(tb[RTA_IIF]); 4189 4190 if (tb[RTA_OIF]) 4191 oif = nla_get_u32(tb[RTA_OIF]); 4192 4193 if (tb[RTA_MARK]) 4194 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 4195 4196 if (tb[RTA_UID]) 4197 fl6.flowi6_uid = make_kuid(current_user_ns(), 4198 nla_get_u32(tb[RTA_UID])); 4199 else 4200 fl6.flowi6_uid = iif ? 
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		if (!fibmatch)
			dst = ip6_route_input_lookup(net, dev, &fl6, flags);
		else
			dst = ip6_route_lookup(net, &fl6, 0);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		if (!fibmatch)
			dst = ip6_route_output(net, NULL, &fl6);
		else
			dst = ip6_route_lookup(net, &fl6, 0);
	}

	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);
	if (fibmatch)
		err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}
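
/* inet6_rtm_getroute() is the doit handler behind "ip -6 route get".
 * With RTM_F_FIB_MATCH set (iproute2's "fibmatch" keyword, e.g. the
 * illustrative invocation "ip -6 route get fibmatch 2001:db8::1"),
 * the matched FIB entry is returned via ip6_route_lookup() instead of
 * the fully resolved dst.
 */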
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER can be fired multiple times by
		 * netdev_wait_allrefs(). Make sure we only do this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= rt6_stats_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release_net,
};
#endif	/* CONFIG_PROC_FS */
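
/* With CONFIG_PROC_FS these two files are created per network
 * namespace by ip6_route_net_init_late() further down:
 *
 *	/proc/net/ipv6_route	one line per FIB entry
 *	/proc/net/rt6_stats	the seven hex counters printed by
 *				rt6_stats_seq_show()
 */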
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;

	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
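
/* Once ipv6_route_sysctl_init() has run for a namespace, these knobs
 * appear under net.ipv6.route.*.  An illustrative use of the
 * write-only (mode 0200) "flush" entry, which funnels into
 * ipv6_sysctl_rtcache_flush() and thus fib6_run_gc():
 *
 *	sysctl -w net.ipv6.route.flush=1
 *
 * For non-init user namespaces table[0].procname is cleared above, so
 * "flush" is not exported there.
 */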
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
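
/* The goto ladder in ip6_route_net_init() unwinds in exact reverse of
 * the allocation order, so a failed kmemdup() of a later template
 * still frees the earlier entries and the dst-entry counters;
 * ip6_route_net_exit() releases the same objects when a namespace
 * goes away.
 */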
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback device is done before this portion
	 * of code, so the loopback reference in rt6_info has not been
	 * taken; do it manually for init_net.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
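
/* ip6_route_cleanup() below tears down everything ip6_route_init()
 * registered, starting with the netdevice notifier and then broadly
 * following the same sequence as the error-unwind ladder above.
 */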
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}