/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque	<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
			   struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);

static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
			    struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}
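/* A minimal sketch of the intended pairing, under the assumption that
 * the caller owns a route living outside the fib6 tree (as the
 * RTF_CACHE clones created in ip6_pol_route() below do): add right
 * after allocation, and let ip6_dst_destroy() do the matching del, so
 * rt6_uncached_list_flush_dev() can re-point the device references of
 * any survivors when their device goes away:
 *
 *	rt = ip6_rt_cache_alloc(ort, daddr, NULL);
 *	if (rt)
 *		rt6_uncached_list_add(rt);
 */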
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			= AF_INET6,
	.gc			= ip6_dst_gc,
	.gc_thresh		= 1024,
	.check			= ip6_dst_check,
	.default_advmss		= ip6_default_advmss,
	.mtu			= ip6_mtu,
	.cow_metrics		= ipv6_cow_metrics,
	.destroy		= ip6_dst_destroy,
	.ifdown			= ip6_dst_ifdown,
	.negative_advice	= ip6_negative_advice,
	.link_failure		= ip6_link_failure,
	.update_pmtu		= ip6_rt_update_pmtu,
	.redirect		= rt6_do_redirect,
	.local_out		= __ip6_local_out,
	.neigh_lookup		= ip6_neigh_lookup,
	.confirm_neigh		= ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			= AF_INET6,
	.destroy		= ip6_dst_destroy,
	.check			= ip6_dst_check,
	.mtu			= ip6_blackhole_mtu,
	.default_advmss		= ip6_default_advmss,
	.update_pmtu		= ip6_rt_blackhole_update_pmtu,
	.redirect		= ip6_rt_blackhole_redirect,
	.cow_metrics		= dst_cow_metrics_generic,
	.neigh_lookup		= ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}
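/* The memset() in rt6_info_init() leans on struct layout: the
 * dst_entry is the first member of struct rt6_info, so "dst + 1"
 * points just past it, and the single memset() zeroes only the
 * rt6_info-specific tail while preserving what dst_alloc() already
 * initialised.  Schematically:
 *
 *	struct rt6_info {
 *		struct dst_entry dst;	// preserved
 *		...			// zeroed, lists re-initialised
 *	};
 */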
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt)
		rt6_info_init(rt);

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	dst->from = NULL;
	dst_release(from);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);

		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
		       rt6_check_expired((struct rt6_info *)rt->dst.from);
	}
	return false;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_chosen;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(fl6, NULL);

	route_chosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
	/* Don't change the route if route_chosen == 0
	 * (the siblings list does not include ourself)
	 */
	if (route_chosen)
		list_for_each_entry_safe(sibling, next_sibling,
					 &match->rt6i_siblings, rt6i_siblings) {
			route_chosen--;
			if (route_chosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}
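/* Worked example of the selection arithmetic above, with hypothetical
 * hash values: for a route with two siblings (rt6i_nsiblings == 2),
 * mp_hash % 3 yields 0, 1 or 2.  Zero keeps "match" itself; a non-zero
 * result walks that many entries into the siblings list, falling back
 * to "match" when the chosen sibling scores negatively:
 *
 *	mp_hash == 9:  9 % 3 == 0  ->  match unchanged
 *	mp_hash == 7:  7 % 3 == 1  ->  first sibling
 */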
/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}
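/* Rough decision table for rt6_check_neigh(), derived from the code
 * above (REACHABLE, STALE, DELAY and PROBE are the NUD_VALID states
 * mentioned in the file header):
 *
 *	NUD_VALID                                    -> RT6_NUD_SUCCEED
 *	entry, !NUD_FAILED (CONFIG_IPV6_ROUTER_PREF) -> RT6_NUD_SUCCEED
 *	entry, NUD_FAILED  (CONFIG_IPV6_ROUTER_PREF) -> RT6_NUD_FAIL_PROBE
 *	entry, !NUD_VALID  (no router pref)          -> RT6_NUD_FAIL_HARD
 *	no entry: SUCCEED with router pref, else RT6_NUD_FAIL_DO_RR
 */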
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);

		if (n < 0)
			return n;
	}
	return m;
}

static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
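/* A hypothetical score composition for rt6_score_route(): the device
 * check contributes 0, 1 or 2 in the low bits, and (with
 * CONFIG_IPV6_ROUTER_PREF) the decoded RFC 4191 preference is OR-ed in
 * shifted left by two, so the two components never collide.  E.g. with
 * an exact oif match and a decoded preference of 2:
 *
 *	m = 2 | (2 << 2) = 10
 */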
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif
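/* The sanity checks in rt6_route_rcv() mirror the RFC 4191 Route
 * Information Option layout, where rinfo->length is in units of
 * 8 octets (one unit of header, then optional prefix bytes):
 *
 *	length 1   -> no prefix bits carried (prefix_len == 0)
 *	length 2   -> up to 64 prefix bits
 *	length 3   -> up to the full 128 prefix bits
 *	length > 3 -> malformed, rejected above
 */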
static struct fib6_node *fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;

	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

	return rt;
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
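/* A minimal usage sketch with hypothetical variables: rt6_lookup()
 * returns the route with a reference held (or NULL on error), so the
 * caller is expected to drop it with ip6_rt_put():
 *
 *	struct rt6_info *rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *	if (rt) {
 *		... inspect rt->dst.dev, rt->rt6i_gateway ...
 *		ip6_rt_put(rt);
 *	}
 */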
/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * The caller must hold dst before calling it.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (rt->rt6i_flags & RTF_LOCAL) {
		/* for copies of local routes, dst->dev needs to be the
		 * device itself if it is a master device, the master
		 * device if the device is enslaved, and the loopback
		 * device as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}

static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct fib6_table *table = rt->rt6i_table;
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_release_immediate(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't bother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
		dst_release_immediate(&pcpu_rt->dst);
		pcpu_rt = rt;
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	read_unlock_bh(&table->tb6_lock);
	return pcpu_rt;
}
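/* The cmpxchg() in rt6_make_pcpu_route() is a lock-free claim on the
 * per-cpu slot: every racing CPU allocates a candidate, only the first
 * swap into the NULL slot wins, and losers free their copy and adopt
 * the winner.  The same pattern in miniature, with hypothetical names:
 *
 *	new = alloc_entry();
 *	old = cmpxchg(&slot, NULL, new);	// returns the prior value
 *	if (old) {				// lost the race
 *		free_entry(new);
 *		new = old;
 *	}
 */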
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look up the route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during
			 * ip6_rt_cache_alloc(), so no need for another
			 * dst_hold().
			 */
			rt6_uncached_list_add(uncached_rt);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			pcpu_rt = rt6_make_pcpu_route(rt);
			dst_release(&rt->dst);
		}

		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
		return pcpu_rt;

	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	memset(keys, 0, sizeof(*keys));
	keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
	keys->addrs.v6addrs.src = key_iph->saddr;
	keys->addrs.v6addrs.dst = key_iph->daddr;
	keys->tags.flow_label = ip6_flowinfo(key_iph);
	keys->basic.ip_proto = key_iph->nexthdr;
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
{
	struct flow_keys hash_keys;

	if (skb) {
		ip6_multipath_l3_keys(skb, &hash_keys);
		return flow_hash_from_keys(&hash_keys);
	}

	return get_hash_from_flowi6(fl6);
}
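/* Why ip6_multipath_l3_keys() digs out the inner header: an ICMPv6
 * error embeds the packet that triggered it, so hashing the embedded
 * addresses steers the error along the same multipath leg as the
 * original flow, rather than hashing the (reversed) outer addresses.
 * Schematically, for a hypothetical flow A -> B:
 *
 *	outer: router -> A  (ICMPV6_PKT_TOOBIG)
 *	inner: A -> B       (this is what gets hashed)
 */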
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_NONE, 0);
	if (rt) {
		rt6_info_init(rt);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
	       (rt->rt6i_flags & RTF_PCPU ||
		rcu_access_pointer(rt->rt6i_node));
}

static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
			/* Release the reference taken in
			 * ip6_rt_cache_alloc()
			 */
			dst_release(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}
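/* The max_t() clamp in __ip6_rt_update_pmtu() enforces the IPv6
 * minimum link MTU (IPV6_MIN_MTU, 1280 bytes): a Packet Too Big
 * advertising less is rounded up rather than trusted.  Hypothetical
 * inputs:
 *
 *	advertised mtu =  576  ->  stored pmtu = 1280
 *	advertised mtu = 1400  ->  stored pmtu = 1400 (if below dst_mtu)
 */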
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_ifdown() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout >> 1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire >> rt_elasticity;
	return entries > rt_max_size;
}
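/* The expiry window above grows under GC pressure (the ++ per
 * invocation) and decays geometrically at the end of every pass:
 * expire -= expire >> elasticity.  With a hypothetical elasticity of
 * 9, each pass trims roughly 1/512 of the current value:
 *
 *	expire = 512  ->  512 - (512 >> 9) = 511
 */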
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;
err:
	kfree(mp);
	return -EINVAL;
}

static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_out;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		case RTN_UNREACHABLE:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}
		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt = NULL;

			/* IPv6 strictly inhibits using non-link-local
			 * addresses as nexthop addresses.
			 * Otherwise, the router will not be able to send
			 * redirects.  That is usually fine, but in some
			 * (rare!) circumstances (SIT, PtP, NBMA NOARP
			 * links) it is handy to allow some exceptions.
			 * --ANK
			 * We allow IPv4-mapped nexthops to support
			 * RFC 4798-type addressing.
			 */
--ANK 2083 We allow IPv4-mapped nexthops to support RFC4798-type 2084 addressing 2085 */ 2086 if (!(gwa_type & (IPV6_ADDR_UNICAST | 2087 IPV6_ADDR_MAPPED))) { 2088 NL_SET_ERR_MSG(extack, 2089 "Invalid gateway address"); 2090 goto out; 2091 } 2092 2093 if (cfg->fc_table) { 2094 grt = ip6_nh_lookup_table(net, cfg, gw_addr); 2095 2096 if (grt) { 2097 if (grt->rt6i_flags & RTF_GATEWAY || 2098 (dev && dev != grt->dst.dev)) { 2099 ip6_rt_put(grt); 2100 grt = NULL; 2101 } 2102 } 2103 } 2104 2105 if (!grt) 2106 grt = rt6_lookup(net, gw_addr, NULL, 2107 cfg->fc_ifindex, 1); 2108 2109 err = -EHOSTUNREACH; 2110 if (!grt) 2111 goto out; 2112 if (dev) { 2113 if (dev != grt->dst.dev) { 2114 ip6_rt_put(grt); 2115 goto out; 2116 } 2117 } else { 2118 dev = grt->dst.dev; 2119 idev = grt->rt6i_idev; 2120 dev_hold(dev); 2121 in6_dev_hold(grt->rt6i_idev); 2122 } 2123 if (!(grt->rt6i_flags & RTF_GATEWAY)) 2124 err = 0; 2125 ip6_rt_put(grt); 2126 2127 if (err) 2128 goto out; 2129 } 2130 err = -EINVAL; 2131 if (!dev) { 2132 NL_SET_ERR_MSG(extack, "Egress device not specified"); 2133 goto out; 2134 } else if (dev->flags & IFF_LOOPBACK) { 2135 NL_SET_ERR_MSG(extack, 2136 "Egress device can not be loopback device for this route"); 2137 goto out; 2138 } 2139 } 2140 2141 err = -ENODEV; 2142 if (!dev) 2143 goto out; 2144 2145 if (!ipv6_addr_any(&cfg->fc_prefsrc)) { 2146 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { 2147 NL_SET_ERR_MSG(extack, "Invalid source address"); 2148 err = -EINVAL; 2149 goto out; 2150 } 2151 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc; 2152 rt->rt6i_prefsrc.plen = 128; 2153 } else 2154 rt->rt6i_prefsrc.plen = 0; 2155 2156 rt->rt6i_flags = cfg->fc_flags; 2157 2158 install_route: 2159 rt->dst.dev = dev; 2160 rt->rt6i_idev = idev; 2161 rt->rt6i_table = table; 2162 2163 cfg->fc_nlinfo.nl_net = dev_net(dev); 2164 2165 return rt; 2166 out: 2167 if (dev) 2168 dev_put(dev); 2169 if (idev) 2170 in6_dev_put(idev); 2171 if (rt) 2172 dst_release_immediate(&rt->dst); 2173 2174 return ERR_PTR(err); 2175 } 2176 2177 int ip6_route_add(struct fib6_config *cfg, 2178 struct netlink_ext_ack *extack) 2179 { 2180 struct mx6_config mxc = { .mx = NULL, }; 2181 struct rt6_info *rt; 2182 int err; 2183 2184 rt = ip6_route_info_create(cfg, extack); 2185 if (IS_ERR(rt)) { 2186 err = PTR_ERR(rt); 2187 rt = NULL; 2188 goto out; 2189 } 2190 2191 err = ip6_convert_metrics(&mxc, cfg); 2192 if (err) 2193 goto out; 2194 2195 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack); 2196 2197 kfree(mxc.mx); 2198 2199 return err; 2200 out: 2201 if (rt) 2202 dst_release_immediate(&rt->dst); 2203 2204 return err; 2205 } 2206 2207 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) 2208 { 2209 int err; 2210 struct fib6_table *table; 2211 struct net *net = dev_net(rt->dst.dev); 2212 2213 if (rt == net->ipv6.ip6_null_entry) { 2214 err = -ENOENT; 2215 goto out; 2216 } 2217 2218 table = rt->rt6i_table; 2219 write_lock_bh(&table->tb6_lock); 2220 err = fib6_del(rt, info); 2221 write_unlock_bh(&table->tb6_lock); 2222 2223 out: 2224 ip6_rt_put(rt); 2225 return err; 2226 } 2227 2228 int ip6_del_rt(struct rt6_info *rt) 2229 { 2230 struct nl_info info = { 2231 .nl_net = dev_net(rt->dst.dev), 2232 }; 2233 return __ip6_del_rt(rt, &info); 2234 } 2235 2236 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) 2237 { 2238 struct nl_info *info = &cfg->fc_nlinfo; 2239 struct net *net = info->nl_net; 2240 struct sk_buff *skb = NULL; 2241 struct fib6_table *table; 2242 int err = -ENOENT; 2243 2244 if (rt == 
net->ipv6.ip6_null_entry) 2245 goto out_put; 2246 table = rt->rt6i_table; 2247 write_lock_bh(&table->tb6_lock); 2248 2249 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { 2250 struct rt6_info *sibling, *next_sibling; 2251 2252 /* prefer to send a single notification with all hops */ 2253 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 2254 if (skb) { 2255 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 2256 2257 if (rt6_fill_node(net, skb, rt, 2258 NULL, NULL, 0, RTM_DELROUTE, 2259 info->portid, seq, 0) < 0) { 2260 kfree_skb(skb); 2261 skb = NULL; 2262 } else 2263 info->skip_notify = 1; 2264 } 2265 2266 list_for_each_entry_safe(sibling, next_sibling, 2267 &rt->rt6i_siblings, 2268 rt6i_siblings) { 2269 err = fib6_del(sibling, info); 2270 if (err) 2271 goto out_unlock; 2272 } 2273 } 2274 2275 err = fib6_del(rt, info); 2276 out_unlock: 2277 write_unlock_bh(&table->tb6_lock); 2278 out_put: 2279 ip6_rt_put(rt); 2280 2281 if (skb) { 2282 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 2283 info->nlh, gfp_any()); 2284 } 2285 return err; 2286 } 2287 2288 static int ip6_route_del(struct fib6_config *cfg, 2289 struct netlink_ext_ack *extack) 2290 { 2291 struct fib6_table *table; 2292 struct fib6_node *fn; 2293 struct rt6_info *rt; 2294 int err = -ESRCH; 2295 2296 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); 2297 if (!table) { 2298 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 2299 return err; 2300 } 2301 2302 read_lock_bh(&table->tb6_lock); 2303 2304 fn = fib6_locate(&table->tb6_root, 2305 &cfg->fc_dst, cfg->fc_dst_len, 2306 &cfg->fc_src, cfg->fc_src_len); 2307 2308 if (fn) { 2309 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 2310 if ((rt->rt6i_flags & RTF_CACHE) && 2311 !(cfg->fc_flags & RTF_CACHE)) 2312 continue; 2313 if (cfg->fc_ifindex && 2314 (!rt->dst.dev || 2315 rt->dst.dev->ifindex != cfg->fc_ifindex)) 2316 continue; 2317 if (cfg->fc_flags & RTF_GATEWAY && 2318 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) 2319 continue; 2320 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric) 2321 continue; 2322 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) 2323 continue; 2324 dst_hold(&rt->dst); 2325 read_unlock_bh(&table->tb6_lock); 2326 2327 /* if gateway was specified only delete the one hop */ 2328 if (cfg->fc_flags & RTF_GATEWAY) 2329 return __ip6_del_rt(rt, &cfg->fc_nlinfo); 2330 2331 return __ip6_del_rt_siblings(rt, cfg); 2332 } 2333 } 2334 read_unlock_bh(&table->tb6_lock); 2335 2336 return err; 2337 } 2338 2339 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) 2340 { 2341 struct netevent_redirect netevent; 2342 struct rt6_info *rt, *nrt = NULL; 2343 struct ndisc_options ndopts; 2344 struct inet6_dev *in6_dev; 2345 struct neighbour *neigh; 2346 struct rd_msg *msg; 2347 int optlen, on_link; 2348 u8 *lladdr; 2349 2350 optlen = skb_tail_pointer(skb) - skb_transport_header(skb); 2351 optlen -= sizeof(*msg); 2352 2353 if (optlen < 0) { 2354 net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); 2355 return; 2356 } 2357 2358 msg = (struct rd_msg *)icmp6_hdr(skb); 2359 2360 if (ipv6_addr_is_multicast(&msg->dest)) { 2361 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); 2362 return; 2363 } 2364 2365 on_link = 0; 2366 if (ipv6_addr_equal(&msg->dest, &msg->target)) { 2367 on_link = 1; 2368 } else if (ipv6_addr_type(&msg->target) != 2369 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { 2370 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); 2371 
return; 2372 } 2373 2374 in6_dev = __in6_dev_get(skb->dev); 2375 if (!in6_dev) 2376 return; 2377 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) 2378 return; 2379 2380 /* RFC2461 8.1: 2381 * The IP source address of the Redirect MUST be the same as the current 2382 * first-hop router for the specified ICMP Destination Address. 2383 */ 2384 2385 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { 2386 net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); 2387 return; 2388 } 2389 2390 lladdr = NULL; 2391 if (ndopts.nd_opts_tgt_lladdr) { 2392 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, 2393 skb->dev); 2394 if (!lladdr) { 2395 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); 2396 return; 2397 } 2398 } 2399 2400 rt = (struct rt6_info *) dst; 2401 if (rt->rt6i_flags & RTF_REJECT) { 2402 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); 2403 return; 2404 } 2405 2406 /* Redirect received -> path was valid. 2407 * Look, redirects are sent only in response to data packets, 2408 * so that this nexthop apparently is reachable. --ANK 2409 */ 2410 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); 2411 2412 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); 2413 if (!neigh) 2414 return; 2415 2416 /* 2417 * We have finally decided to accept it. 2418 */ 2419 2420 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, 2421 NEIGH_UPDATE_F_WEAK_OVERRIDE| 2422 NEIGH_UPDATE_F_OVERRIDE| 2423 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| 2424 NEIGH_UPDATE_F_ISROUTER)), 2425 NDISC_REDIRECT, &ndopts); 2426 2427 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL); 2428 if (!nrt) 2429 goto out; 2430 2431 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; 2432 if (on_link) 2433 nrt->rt6i_flags &= ~RTF_GATEWAY; 2434 2435 nrt->rt6i_protocol = RTPROT_REDIRECT; 2436 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; 2437 2438 if (ip6_ins_rt(nrt)) 2439 goto out_release; 2440 2441 netevent.old = &rt->dst; 2442 netevent.new = &nrt->dst; 2443 netevent.daddr = &msg->dest; 2444 netevent.neigh = neigh; 2445 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); 2446 2447 if (rt->rt6i_flags & RTF_CACHE) { 2448 rt = (struct rt6_info *) dst_clone(&rt->dst); 2449 ip6_del_rt(rt); 2450 } 2451 2452 out_release: 2453 /* Release the reference taken in 2454 * ip6_rt_cache_alloc() 2455 */ 2456 dst_release(&nrt->dst); 2457 2458 out: 2459 neigh_release(neigh); 2460 } 2461 2462 /* 2463 * Misc support functions 2464 */ 2465 2466 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from) 2467 { 2468 BUG_ON(from->dst.from); 2469 2470 rt->rt6i_flags &= ~RTF_EXPIRES; 2471 dst_hold(&from->dst); 2472 rt->dst.from = &from->dst; 2473 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true); 2474 } 2475 2476 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) 2477 { 2478 rt->dst.input = ort->dst.input; 2479 rt->dst.output = ort->dst.output; 2480 rt->rt6i_dst = ort->rt6i_dst; 2481 rt->dst.error = ort->dst.error; 2482 rt->rt6i_idev = ort->rt6i_idev; 2483 if (rt->rt6i_idev) 2484 in6_dev_hold(rt->rt6i_idev); 2485 rt->dst.lastuse = jiffies; 2486 rt->rt6i_gateway = ort->rt6i_gateway; 2487 rt->rt6i_flags = ort->rt6i_flags; 2488 rt6_set_from(rt, ort); 2489 rt->rt6i_metric = ort->rt6i_metric; 2490 #ifdef CONFIG_IPV6_SUBTREES 2491 rt->rt6i_src = ort->rt6i_src; 2492 #endif 2493 rt->rt6i_prefsrc = ort->rt6i_prefsrc; 2494 rt->rt6i_table = ort->rt6i_table; 2495 rt->dst.lwtstate = 
lwtstate_get(ort->dst.lwtstate); 2496 } 2497 2498 #ifdef CONFIG_IPV6_ROUTE_INFO 2499 static struct rt6_info *rt6_get_route_info(struct net *net, 2500 const struct in6_addr *prefix, int prefixlen, 2501 const struct in6_addr *gwaddr, 2502 struct net_device *dev) 2503 { 2504 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; 2505 int ifindex = dev->ifindex; 2506 struct fib6_node *fn; 2507 struct rt6_info *rt = NULL; 2508 struct fib6_table *table; 2509 2510 table = fib6_get_table(net, tb_id); 2511 if (!table) 2512 return NULL; 2513 2514 read_lock_bh(&table->tb6_lock); 2515 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); 2516 if (!fn) 2517 goto out; 2518 2519 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { 2520 if (rt->dst.dev->ifindex != ifindex) 2521 continue; 2522 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) 2523 continue; 2524 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) 2525 continue; 2526 dst_hold(&rt->dst); 2527 break; 2528 } 2529 out: 2530 read_unlock_bh(&table->tb6_lock); 2531 return rt; 2532 } 2533 2534 static struct rt6_info *rt6_add_route_info(struct net *net, 2535 const struct in6_addr *prefix, int prefixlen, 2536 const struct in6_addr *gwaddr, 2537 struct net_device *dev, 2538 unsigned int pref) 2539 { 2540 struct fib6_config cfg = { 2541 .fc_metric = IP6_RT_PRIO_USER, 2542 .fc_ifindex = dev->ifindex, 2543 .fc_dst_len = prefixlen, 2544 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | 2545 RTF_UP | RTF_PREF(pref), 2546 .fc_protocol = RTPROT_RA, 2547 .fc_nlinfo.portid = 0, 2548 .fc_nlinfo.nlh = NULL, 2549 .fc_nlinfo.nl_net = net, 2550 }; 2551 2552 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO, 2553 cfg.fc_dst = *prefix; 2554 cfg.fc_gateway = *gwaddr; 2555 2556 /* We should treat it as a default route if prefix length is 0. */ 2557 if (!prefixlen) 2558 cfg.fc_flags |= RTF_DEFAULT; 2559 2560 ip6_route_add(&cfg, NULL); 2561 2562 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); 2563 } 2564 #endif 2565 2566 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev) 2567 { 2568 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; 2569 struct rt6_info *rt; 2570 struct fib6_table *table; 2571 2572 table = fib6_get_table(dev_net(dev), tb_id); 2573 if (!table) 2574 return NULL; 2575 2576 read_lock_bh(&table->tb6_lock); 2577 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 2578 if (dev == rt->dst.dev && 2579 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && 2580 ipv6_addr_equal(&rt->rt6i_gateway, addr)) 2581 break; 2582 } 2583 if (rt) 2584 dst_hold(&rt->dst); 2585 read_unlock_bh(&table->tb6_lock); 2586 return rt; 2587 } 2588 2589 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, 2590 struct net_device *dev, 2591 unsigned int pref) 2592 { 2593 struct fib6_config cfg = { 2594 .fc_table = l3mdev_fib_table(dev) ? 
: RT6_TABLE_DFLT, 2595 .fc_metric = IP6_RT_PRIO_USER, 2596 .fc_ifindex = dev->ifindex, 2597 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | 2598 RTF_UP | RTF_EXPIRES | RTF_PREF(pref), 2599 .fc_protocol = RTPROT_RA, 2600 .fc_nlinfo.portid = 0, 2601 .fc_nlinfo.nlh = NULL, 2602 .fc_nlinfo.nl_net = dev_net(dev), 2603 }; 2604 2605 cfg.fc_gateway = *gwaddr; 2606 2607 if (!ip6_route_add(&cfg, NULL)) { 2608 struct fib6_table *table; 2609 2610 table = fib6_get_table(dev_net(dev), cfg.fc_table); 2611 if (table) 2612 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; 2613 } 2614 2615 return rt6_get_dflt_router(gwaddr, dev); 2616 } 2617 2618 static void __rt6_purge_dflt_routers(struct fib6_table *table) 2619 { 2620 struct rt6_info *rt; 2621 2622 restart: 2623 read_lock_bh(&table->tb6_lock); 2624 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { 2625 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && 2626 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { 2627 dst_hold(&rt->dst); 2628 read_unlock_bh(&table->tb6_lock); 2629 ip6_del_rt(rt); 2630 goto restart; 2631 } 2632 } 2633 read_unlock_bh(&table->tb6_lock); 2634 2635 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; 2636 } 2637 2638 void rt6_purge_dflt_routers(struct net *net) 2639 { 2640 struct fib6_table *table; 2641 struct hlist_head *head; 2642 unsigned int h; 2643 2644 rcu_read_lock(); 2645 2646 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { 2647 head = &net->ipv6.fib_table_hash[h]; 2648 hlist_for_each_entry_rcu(table, head, tb6_hlist) { 2649 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) 2650 __rt6_purge_dflt_routers(table); 2651 } 2652 } 2653 2654 rcu_read_unlock(); 2655 } 2656 2657 static void rtmsg_to_fib6_config(struct net *net, 2658 struct in6_rtmsg *rtmsg, 2659 struct fib6_config *cfg) 2660 { 2661 memset(cfg, 0, sizeof(*cfg)); 2662 2663 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? 
2664 : RT6_TABLE_MAIN; 2665 cfg->fc_ifindex = rtmsg->rtmsg_ifindex; 2666 cfg->fc_metric = rtmsg->rtmsg_metric; 2667 cfg->fc_expires = rtmsg->rtmsg_info; 2668 cfg->fc_dst_len = rtmsg->rtmsg_dst_len; 2669 cfg->fc_src_len = rtmsg->rtmsg_src_len; 2670 cfg->fc_flags = rtmsg->rtmsg_flags; 2671 2672 cfg->fc_nlinfo.nl_net = net; 2673 2674 cfg->fc_dst = rtmsg->rtmsg_dst; 2675 cfg->fc_src = rtmsg->rtmsg_src; 2676 cfg->fc_gateway = rtmsg->rtmsg_gateway; 2677 } 2678 2679 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) 2680 { 2681 struct fib6_config cfg; 2682 struct in6_rtmsg rtmsg; 2683 int err; 2684 2685 switch (cmd) { 2686 case SIOCADDRT: /* Add a route */ 2687 case SIOCDELRT: /* Delete a route */ 2688 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 2689 return -EPERM; 2690 err = copy_from_user(&rtmsg, arg, 2691 sizeof(struct in6_rtmsg)); 2692 if (err) 2693 return -EFAULT; 2694 2695 rtmsg_to_fib6_config(net, &rtmsg, &cfg); 2696 2697 rtnl_lock(); 2698 switch (cmd) { 2699 case SIOCADDRT: 2700 err = ip6_route_add(&cfg, NULL); 2701 break; 2702 case SIOCDELRT: 2703 err = ip6_route_del(&cfg, NULL); 2704 break; 2705 default: 2706 err = -EINVAL; 2707 } 2708 rtnl_unlock(); 2709 2710 return err; 2711 } 2712 2713 return -EINVAL; 2714 } 2715 2716 /* 2717 * Drop the packet on the floor 2718 */ 2719 2720 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) 2721 { 2722 int type; 2723 struct dst_entry *dst = skb_dst(skb); 2724 switch (ipstats_mib_noroutes) { 2725 case IPSTATS_MIB_INNOROUTES: 2726 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); 2727 if (type == IPV6_ADDR_ANY) { 2728 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 2729 IPSTATS_MIB_INADDRERRORS); 2730 break; 2731 } 2732 /* FALLTHROUGH */ 2733 case IPSTATS_MIB_OUTNOROUTES: 2734 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), 2735 ipstats_mib_noroutes); 2736 break; 2737 } 2738 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); 2739 kfree_skb(skb); 2740 return 0; 2741 } 2742 2743 static int ip6_pkt_discard(struct sk_buff *skb) 2744 { 2745 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); 2746 } 2747 2748 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) 2749 { 2750 skb->dev = skb_dst(skb)->dev; 2751 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); 2752 } 2753 2754 static int ip6_pkt_prohibit(struct sk_buff *skb) 2755 { 2756 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); 2757 } 2758 2759 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) 2760 { 2761 skb->dev = skb_dst(skb)->dev; 2762 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); 2763 } 2764 2765 /* 2766 * Allocate a dst for local (unicast / anycast) address. 
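 * A minimal usage sketch (illustrative, assuming an addrconf-style caller
 * that already holds idev and has an address ifp->addr; not a quote of
 * the actual callers):
 *
 *	rt = addrconf_dst_alloc(idev, &ifp->addr, false);
 *	if (!IS_ERR(rt))
 *		ip6_ins_rt(rt);		-- install the local route in the FIB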
2767 */ 2768 2769 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, 2770 const struct in6_addr *addr, 2771 bool anycast) 2772 { 2773 u32 tb_id; 2774 struct net *net = dev_net(idev->dev); 2775 struct net_device *dev = idev->dev; 2776 struct rt6_info *rt; 2777 2778 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT); 2779 if (!rt) 2780 return ERR_PTR(-ENOMEM); 2781 2782 in6_dev_hold(idev); 2783 2784 rt->dst.flags |= DST_HOST; 2785 rt->dst.input = ip6_input; 2786 rt->dst.output = ip6_output; 2787 rt->rt6i_idev = idev; 2788 2789 rt->rt6i_protocol = RTPROT_KERNEL; 2790 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; 2791 if (anycast) 2792 rt->rt6i_flags |= RTF_ANYCAST; 2793 else 2794 rt->rt6i_flags |= RTF_LOCAL; 2795 2796 rt->rt6i_gateway = *addr; 2797 rt->rt6i_dst.addr = *addr; 2798 rt->rt6i_dst.plen = 128; 2799 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL; 2800 rt->rt6i_table = fib6_get_table(net, tb_id); 2801 2802 return rt; 2803 } 2804 2805 /* remove a deleted IP address from prefsrc entries */ 2806 struct arg_dev_net_ip { 2807 struct net_device *dev; 2808 struct net *net; 2809 struct in6_addr *addr; 2810 }; 2811 2812 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) 2813 { 2814 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev; 2815 struct net *net = ((struct arg_dev_net_ip *)arg)->net; 2816 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; 2817 2818 if (((void *)rt->dst.dev == dev || !dev) && 2819 rt != net->ipv6.ip6_null_entry && 2820 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { 2821 /* remove prefsrc entry */ 2822 rt->rt6i_prefsrc.plen = 0; 2823 } 2824 return 0; 2825 } 2826 2827 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) 2828 { 2829 struct net *net = dev_net(ifp->idev->dev); 2830 struct arg_dev_net_ip adni = { 2831 .dev = ifp->idev->dev, 2832 .net = net, 2833 .addr = &ifp->addr, 2834 }; 2835 fib6_clean_all(net, fib6_remove_prefsrc, &adni); 2836 } 2837 2838 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) 2839 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) 2840 2841 /* Remove routers and update dst entries when a gateway turns into a host.
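(Called e.g. from ndisc when a router advertisement drops a router's lifetime to zero; fib6_clean_tohost() below flags matching routes, and fib6_clean_all() deletes them.)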
*/ 2842 static int fib6_clean_tohost(struct rt6_info *rt, void *arg) 2843 { 2844 struct in6_addr *gateway = (struct in6_addr *)arg; 2845 2846 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || 2847 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && 2848 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { 2849 return -1; 2850 } 2851 return 0; 2852 } 2853 2854 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) 2855 { 2856 fib6_clean_all(net, fib6_clean_tohost, gateway); 2857 } 2858 2859 struct arg_dev_net { 2860 struct net_device *dev; 2861 struct net *net; 2862 }; 2863 2864 /* called with write lock held for table with rt */ 2865 static int fib6_ifdown(struct rt6_info *rt, void *arg) 2866 { 2867 const struct arg_dev_net *adn = arg; 2868 const struct net_device *dev = adn->dev; 2869 2870 if ((rt->dst.dev == dev || !dev) && 2871 rt != adn->net->ipv6.ip6_null_entry && 2872 (rt->rt6i_nsiblings == 0 || 2873 (dev && netdev_unregistering(dev)) || 2874 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown)) 2875 return -1; 2876 2877 return 0; 2878 } 2879 2880 void rt6_ifdown(struct net *net, struct net_device *dev) 2881 { 2882 struct arg_dev_net adn = { 2883 .dev = dev, 2884 .net = net, 2885 }; 2886 2887 fib6_clean_all(net, fib6_ifdown, &adn); 2888 if (dev) 2889 rt6_uncached_list_flush_dev(net, dev); 2890 } 2891 2892 struct rt6_mtu_change_arg { 2893 struct net_device *dev; 2894 unsigned int mtu; 2895 }; 2896 2897 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) 2898 { 2899 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; 2900 struct inet6_dev *idev; 2901 2902 /* In IPv6, PMTU discovery is not optional, 2903 so the RTAX_MTU lock cannot disable it. 2904 We still use this lock to block changes 2905 caused by addrconf/ndisc. 2906 */ 2907 2908 idev = __in6_dev_get(arg->dev); 2909 if (!idev) 2910 return 0; 2911 2912 /* For an administrative MTU increase there is no way to discover 2913 an IPv6 PMTU increase, so the PMTU has to be updated here. 2914 Since RFC 1981 doesn't cover administrative MTU increases, 2915 updating the PMTU on such an increase is a MUST (e.g. jumbo frames). 2916 */ 2917 /* 2918 If the new MTU is less than the route PMTU, the new MTU will be the 2919 lowest MTU in the path; update the route PMTU to reflect the 2920 decrease. If the new MTU is greater than the route PMTU, and the 2921 old MTU is the lowest MTU in the path, update the route PMTU 2922 to reflect the increase. In that case, if another node's MTU 2923 is now the lowest MTU in the path, a PACKET TOO BIG message will 2924 trigger PMTU discovery again. 2925 */ 2926 if (rt->dst.dev == arg->dev && 2927 dst_metric_raw(&rt->dst, RTAX_MTU) && 2928 !dst_metric_locked(&rt->dst, RTAX_MTU)) { 2929 if (rt->rt6i_flags & RTF_CACHE) { 2930 /* For RTF_CACHE with rt6i_pmtu == 0 2931 * (i.e. a redirected route), 2932 * the metrics of its rt->dst.from have already 2933 * been updated.
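(A worked example of the non-cached branch below, assuming the usual addrconf ordering where idev->cnf.mtu6 still holds the old device MTU during this walk: lowering the MTU from 9000 to 1500 clamps every unlocked route MTU through the device down to 1500, while raising it from 1500 back to 9000 only lifts routes whose MTU still equals the old 1500.)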
2934 */ 2935 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) 2936 rt->rt6i_pmtu = arg->mtu; 2937 } else if (dst_mtu(&rt->dst) >= arg->mtu || 2938 (dst_mtu(&rt->dst) < arg->mtu && 2939 dst_mtu(&rt->dst) == idev->cnf.mtu6)) { 2940 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); 2941 } 2942 } 2943 return 0; 2944 } 2945 2946 void rt6_mtu_change(struct net_device *dev, unsigned int mtu) 2947 { 2948 struct rt6_mtu_change_arg arg = { 2949 .dev = dev, 2950 .mtu = mtu, 2951 }; 2952 2953 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); 2954 } 2955 2956 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { 2957 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, 2958 [RTA_OIF] = { .type = NLA_U32 }, 2959 [RTA_IIF] = { .type = NLA_U32 }, 2960 [RTA_PRIORITY] = { .type = NLA_U32 }, 2961 [RTA_METRICS] = { .type = NLA_NESTED }, 2962 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 2963 [RTA_PREF] = { .type = NLA_U8 }, 2964 [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, 2965 [RTA_ENCAP] = { .type = NLA_NESTED }, 2966 [RTA_EXPIRES] = { .type = NLA_U32 }, 2967 [RTA_UID] = { .type = NLA_U32 }, 2968 [RTA_MARK] = { .type = NLA_U32 }, 2969 }; 2970 2971 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, 2972 struct fib6_config *cfg, 2973 struct netlink_ext_ack *extack) 2974 { 2975 struct rtmsg *rtm; 2976 struct nlattr *tb[RTA_MAX+1]; 2977 unsigned int pref; 2978 int err; 2979 2980 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 2981 NULL); 2982 if (err < 0) 2983 goto errout; 2984 2985 err = -EINVAL; 2986 rtm = nlmsg_data(nlh); 2987 memset(cfg, 0, sizeof(*cfg)); 2988 2989 cfg->fc_table = rtm->rtm_table; 2990 cfg->fc_dst_len = rtm->rtm_dst_len; 2991 cfg->fc_src_len = rtm->rtm_src_len; 2992 cfg->fc_flags = RTF_UP; 2993 cfg->fc_protocol = rtm->rtm_protocol; 2994 cfg->fc_type = rtm->rtm_type; 2995 2996 if (rtm->rtm_type == RTN_UNREACHABLE || 2997 rtm->rtm_type == RTN_BLACKHOLE || 2998 rtm->rtm_type == RTN_PROHIBIT || 2999 rtm->rtm_type == RTN_THROW) 3000 cfg->fc_flags |= RTF_REJECT; 3001 3002 if (rtm->rtm_type == RTN_LOCAL) 3003 cfg->fc_flags |= RTF_LOCAL; 3004 3005 if (rtm->rtm_flags & RTM_F_CLONED) 3006 cfg->fc_flags |= RTF_CACHE; 3007 3008 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 3009 cfg->fc_nlinfo.nlh = nlh; 3010 cfg->fc_nlinfo.nl_net = sock_net(skb->sk); 3011 3012 if (tb[RTA_GATEWAY]) { 3013 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); 3014 cfg->fc_flags |= RTF_GATEWAY; 3015 } 3016 3017 if (tb[RTA_DST]) { 3018 int plen = (rtm->rtm_dst_len + 7) >> 3; 3019 3020 if (nla_len(tb[RTA_DST]) < plen) 3021 goto errout; 3022 3023 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); 3024 } 3025 3026 if (tb[RTA_SRC]) { 3027 int plen = (rtm->rtm_src_len + 7) >> 3; 3028 3029 if (nla_len(tb[RTA_SRC]) < plen) 3030 goto errout; 3031 3032 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); 3033 } 3034 3035 if (tb[RTA_PREFSRC]) 3036 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); 3037 3038 if (tb[RTA_OIF]) 3039 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); 3040 3041 if (tb[RTA_PRIORITY]) 3042 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); 3043 3044 if (tb[RTA_METRICS]) { 3045 cfg->fc_mx = nla_data(tb[RTA_METRICS]); 3046 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); 3047 } 3048 3049 if (tb[RTA_TABLE]) 3050 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); 3051 3052 if (tb[RTA_MULTIPATH]) { 3053 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); 3054 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); 3055 3056 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, 3057 cfg->fc_mp_len, extack); 3058 if (err 
< 0) 3059 goto errout; 3060 } 3061 3062 if (tb[RTA_PREF]) { 3063 pref = nla_get_u8(tb[RTA_PREF]); 3064 if (pref != ICMPV6_ROUTER_PREF_LOW && 3065 pref != ICMPV6_ROUTER_PREF_HIGH) 3066 pref = ICMPV6_ROUTER_PREF_MEDIUM; 3067 cfg->fc_flags |= RTF_PREF(pref); 3068 } 3069 3070 if (tb[RTA_ENCAP]) 3071 cfg->fc_encap = tb[RTA_ENCAP]; 3072 3073 if (tb[RTA_ENCAP_TYPE]) { 3074 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); 3075 3076 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); 3077 if (err < 0) 3078 goto errout; 3079 } 3080 3081 if (tb[RTA_EXPIRES]) { 3082 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); 3083 3084 if (addrconf_finite_timeout(timeout)) { 3085 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); 3086 cfg->fc_flags |= RTF_EXPIRES; 3087 } 3088 } 3089 3090 err = 0; 3091 errout: 3092 return err; 3093 } 3094 3095 struct rt6_nh { 3096 struct rt6_info *rt6_info; 3097 struct fib6_config r_cfg; 3098 struct mx6_config mxc; 3099 struct list_head next; 3100 }; 3101 3102 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list) 3103 { 3104 struct rt6_nh *nh; 3105 3106 list_for_each_entry(nh, rt6_nh_list, next) { 3107 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n", 3108 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway, 3109 nh->r_cfg.fc_ifindex); 3110 } 3111 } 3112 3113 static int ip6_route_info_append(struct list_head *rt6_nh_list, 3114 struct rt6_info *rt, struct fib6_config *r_cfg) 3115 { 3116 struct rt6_nh *nh; 3117 int err = -EEXIST; 3118 3119 list_for_each_entry(nh, rt6_nh_list, next) { 3120 /* check if rt6_info already exists */ 3121 if (rt6_duplicate_nexthop(nh->rt6_info, rt)) 3122 return err; 3123 } 3124 3125 nh = kzalloc(sizeof(*nh), GFP_KERNEL); 3126 if (!nh) 3127 return -ENOMEM; 3128 nh->rt6_info = rt; 3129 err = ip6_convert_metrics(&nh->mxc, r_cfg); 3130 if (err) { 3131 kfree(nh); 3132 return err; 3133 } 3134 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); 3135 list_add_tail(&nh->next, rt6_nh_list); 3136 3137 return 0; 3138 } 3139 3140 static void ip6_route_mpath_notify(struct rt6_info *rt, 3141 struct rt6_info *rt_last, 3142 struct nl_info *info, 3143 __u16 nlflags) 3144 { 3145 /* if this is an APPEND route, then rt points to the first route 3146 * inserted and rt_last points to last route inserted. Userspace 3147 * wants a consistent dump of the route which starts at the first 3148 * nexthop. Since sibling routes are always added at the end of 3149 * the list, find the first sibling of the last route appended 3150 */ 3151 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) { 3152 rt = list_first_entry(&rt_last->rt6i_siblings, 3153 struct rt6_info, 3154 rt6i_siblings); 3155 } 3156 3157 if (rt) 3158 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); 3159 } 3160 3161 static int ip6_route_multipath_add(struct fib6_config *cfg, 3162 struct netlink_ext_ack *extack) 3163 { 3164 struct rt6_info *rt_notif = NULL, *rt_last = NULL; 3165 struct nl_info *info = &cfg->fc_nlinfo; 3166 struct fib6_config r_cfg; 3167 struct rtnexthop *rtnh; 3168 struct rt6_info *rt; 3169 struct rt6_nh *err_nh; 3170 struct rt6_nh *nh, *nh_safe; 3171 __u16 nlflags; 3172 int remaining; 3173 int attrlen; 3174 int err = 1; 3175 int nhn = 0; 3176 int replace = (cfg->fc_nlinfo.nlh && 3177 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); 3178 LIST_HEAD(rt6_nh_list); 3179 3180 nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; 3181 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) 3182 nlflags |= NLM_F_APPEND; 3183 3184 remaining = cfg->fc_mp_len; 3185 rtnh = (struct rtnexthop *)cfg->fc_mp; 3186 3187 /* Parse a Multipath Entry and build a list (rt6_nh_list) of 3188 * rt6_info structs per nexthop 3189 */ 3190 while (rtnh_ok(rtnh, remaining)) { 3191 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3192 if (rtnh->rtnh_ifindex) 3193 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3194 3195 attrlen = rtnh_attrlen(rtnh); 3196 if (attrlen > 0) { 3197 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3198 3199 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3200 if (nla) { 3201 r_cfg.fc_gateway = nla_get_in6_addr(nla); 3202 r_cfg.fc_flags |= RTF_GATEWAY; 3203 } 3204 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); 3205 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); 3206 if (nla) 3207 r_cfg.fc_encap_type = nla_get_u16(nla); 3208 } 3209 3210 rt = ip6_route_info_create(&r_cfg, extack); 3211 if (IS_ERR(rt)) { 3212 err = PTR_ERR(rt); 3213 rt = NULL; 3214 goto cleanup; 3215 } 3216 3217 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); 3218 if (err) { 3219 dst_release_immediate(&rt->dst); 3220 goto cleanup; 3221 } 3222 3223 rtnh = rtnh_next(rtnh, &remaining); 3224 } 3225 3226 /* For add and replace, send one notification with all nexthops: 3227 * skip the notification in fib6_add_rt2node and send one with 3228 * the full route when done. 3229 */ 3230 info->skip_notify = 1; 3231 3232 err_nh = NULL; 3233 list_for_each_entry(nh, &rt6_nh_list, next) { 3234 rt_last = nh->rt6_info; 3235 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack); 3236 /* save reference to first route for notification */ 3237 if (!rt_notif && !err) 3238 rt_notif = nh->rt6_info; 3239 3240 /* nh->rt6_info is used or freed at this point, reset to NULL */ 3241 nh->rt6_info = NULL; 3242 if (err) { 3243 if (replace && nhn) 3244 ip6_print_replace_route_err(&rt6_nh_list); 3245 err_nh = nh; 3246 goto add_errout; 3247 } 3248 3249 /* Because each route is added like a single route, we remove 3250 * these flags after the first nexthop: if there is a collision, 3251 * we have already failed to add the first nexthop, since 3252 * fib6_add_rt2node() has rejected it; when replacing, the old 3253 * nexthops have already been replaced by the first new one, and 3254 * the rest should be appended to it. 3255 */ 3256 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | 3257 NLM_F_REPLACE); 3258 nhn++; 3259 } 3260 3261 /* success ... 
tell user about new route */ 3262 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3263 goto cleanup; 3264 3265 add_errout: 3266 /* send notification for routes that were added so that 3267 * the delete notifications sent by ip6_route_del are 3268 * coherent 3269 */ 3270 if (rt_notif) 3271 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); 3272 3273 /* Delete routes that were already added */ 3274 list_for_each_entry(nh, &rt6_nh_list, next) { 3275 if (err_nh == nh) 3276 break; 3277 ip6_route_del(&nh->r_cfg, extack); 3278 } 3279 3280 cleanup: 3281 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { 3282 if (nh->rt6_info) 3283 dst_release_immediate(&nh->rt6_info->dst); 3284 kfree(nh->mxc.mx); 3285 list_del(&nh->next); 3286 kfree(nh); 3287 } 3288 3289 return err; 3290 } 3291 3292 static int ip6_route_multipath_del(struct fib6_config *cfg, 3293 struct netlink_ext_ack *extack) 3294 { 3295 struct fib6_config r_cfg; 3296 struct rtnexthop *rtnh; 3297 int remaining; 3298 int attrlen; 3299 int err = 1, last_err = 0; 3300 3301 remaining = cfg->fc_mp_len; 3302 rtnh = (struct rtnexthop *)cfg->fc_mp; 3303 3304 /* Parse a Multipath Entry */ 3305 while (rtnh_ok(rtnh, remaining)) { 3306 memcpy(&r_cfg, cfg, sizeof(*cfg)); 3307 if (rtnh->rtnh_ifindex) 3308 r_cfg.fc_ifindex = rtnh->rtnh_ifindex; 3309 3310 attrlen = rtnh_attrlen(rtnh); 3311 if (attrlen > 0) { 3312 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 3313 3314 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 3315 if (nla) { 3316 nla_memcpy(&r_cfg.fc_gateway, nla, 16); 3317 r_cfg.fc_flags |= RTF_GATEWAY; 3318 } 3319 } 3320 err = ip6_route_del(&r_cfg, extack); 3321 if (err) 3322 last_err = err; 3323 3324 rtnh = rtnh_next(rtnh, &remaining); 3325 } 3326 3327 return last_err; 3328 } 3329 3330 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3331 struct netlink_ext_ack *extack) 3332 { 3333 struct fib6_config cfg; 3334 int err; 3335 3336 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3337 if (err < 0) 3338 return err; 3339 3340 if (cfg.fc_mp) 3341 return ip6_route_multipath_del(&cfg, extack); 3342 else { 3343 cfg.fc_delete_all_nh = 1; 3344 return ip6_route_del(&cfg, extack); 3345 } 3346 } 3347 3348 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, 3349 struct netlink_ext_ack *extack) 3350 { 3351 struct fib6_config cfg; 3352 int err; 3353 3354 err = rtm_to_fib6_config(skb, nlh, &cfg, extack); 3355 if (err < 0) 3356 return err; 3357 3358 if (cfg.fc_mp) 3359 return ip6_route_multipath_add(&cfg, extack); 3360 else 3361 return ip6_route_add(&cfg, extack); 3362 } 3363 3364 static size_t rt6_nlmsg_size(struct rt6_info *rt) 3365 { 3366 int nexthop_len = 0; 3367 3368 if (rt->rt6i_nsiblings) { 3369 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */ 3370 + NLA_ALIGN(sizeof(struct rtnexthop)) 3371 + nla_total_size(16) /* RTA_GATEWAY */ 3372 + lwtunnel_get_encap_size(rt->dst.lwtstate); 3373 3374 nexthop_len *= rt->rt6i_nsiblings; 3375 } 3376 3377 return NLMSG_ALIGN(sizeof(struct rtmsg)) 3378 + nla_total_size(16) /* RTA_SRC */ 3379 + nla_total_size(16) /* RTA_DST */ 3380 + nla_total_size(16) /* RTA_GATEWAY */ 3381 + nla_total_size(16) /* RTA_PREFSRC */ 3382 + nla_total_size(4) /* RTA_TABLE */ 3383 + nla_total_size(4) /* RTA_IIF */ 3384 + nla_total_size(4) /* RTA_OIF */ 3385 + nla_total_size(4) /* RTA_PRIORITY */ 3386 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ 3387 + nla_total_size(sizeof(struct rta_cacheinfo)) 3388 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ 3389 + nla_total_size(1) /* 
RTA_PREF */ 3390 + lwtunnel_get_encap_size(rt->dst.lwtstate) 3391 + nexthop_len; 3392 } 3393 3394 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, 3395 unsigned int *flags, bool skip_oif) 3396 { 3397 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) { 3398 *flags |= RTNH_F_LINKDOWN; 3399 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 3400 *flags |= RTNH_F_DEAD; 3401 } 3402 3403 if (rt->rt6i_flags & RTF_GATEWAY) { 3404 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 3405 goto nla_put_failure; 3406 } 3407 3408 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) 3409 *flags |= RTNH_F_OFFLOAD; 3410 3411 /* not needed for multipath encoding b/c it has a rtnexthop struct */ 3412 if (!skip_oif && rt->dst.dev && 3413 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 3414 goto nla_put_failure; 3415 3416 if (rt->dst.lwtstate && 3417 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0) 3418 goto nla_put_failure; 3419 3420 return 0; 3421 3422 nla_put_failure: 3423 return -EMSGSIZE; 3424 } 3425 3426 /* add multipath next hop */ 3427 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt) 3428 { 3429 struct rtnexthop *rtnh; 3430 unsigned int flags = 0; 3431 3432 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 3433 if (!rtnh) 3434 goto nla_put_failure; 3435 3436 rtnh->rtnh_hops = 0; 3437 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0; 3438 3439 if (rt6_nexthop_info(skb, rt, &flags, true) < 0) 3440 goto nla_put_failure; 3441 3442 rtnh->rtnh_flags = flags; 3443 3444 /* length of rtnetlink header + attributes */ 3445 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; 3446 3447 return 0; 3448 3449 nla_put_failure: 3450 return -EMSGSIZE; 3451 } 3452 3453 static int rt6_fill_node(struct net *net, 3454 struct sk_buff *skb, struct rt6_info *rt, 3455 struct in6_addr *dst, struct in6_addr *src, 3456 int iif, int type, u32 portid, u32 seq, 3457 unsigned int flags) 3458 { 3459 u32 metrics[RTAX_MAX]; 3460 struct rtmsg *rtm; 3461 struct nlmsghdr *nlh; 3462 long expires; 3463 u32 table; 3464 3465 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 3466 if (!nlh) 3467 return -EMSGSIZE; 3468 3469 rtm = nlmsg_data(nlh); 3470 rtm->rtm_family = AF_INET6; 3471 rtm->rtm_dst_len = rt->rt6i_dst.plen; 3472 rtm->rtm_src_len = rt->rt6i_src.plen; 3473 rtm->rtm_tos = 0; 3474 if (rt->rt6i_table) 3475 table = rt->rt6i_table->tb6_id; 3476 else 3477 table = RT6_TABLE_UNSPEC; 3478 rtm->rtm_table = table; 3479 if (nla_put_u32(skb, RTA_TABLE, table)) 3480 goto nla_put_failure; 3481 if (rt->rt6i_flags & RTF_REJECT) { 3482 switch (rt->dst.error) { 3483 case -EINVAL: 3484 rtm->rtm_type = RTN_BLACKHOLE; 3485 break; 3486 case -EACCES: 3487 rtm->rtm_type = RTN_PROHIBIT; 3488 break; 3489 case -EAGAIN: 3490 rtm->rtm_type = RTN_THROW; 3491 break; 3492 default: 3493 rtm->rtm_type = RTN_UNREACHABLE; 3494 break; 3495 } 3496 } 3497 else if (rt->rt6i_flags & RTF_LOCAL) 3498 rtm->rtm_type = RTN_LOCAL; 3499 else if (rt->rt6i_flags & RTF_ANYCAST) 3500 rtm->rtm_type = RTN_ANYCAST; 3501 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 3502 rtm->rtm_type = RTN_LOCAL; 3503 else 3504 rtm->rtm_type = RTN_UNICAST; 3505 rtm->rtm_flags = 0; 3506 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 3507 rtm->rtm_protocol = rt->rt6i_protocol; 3508 3509 if (rt->rt6i_flags & RTF_CACHE) 3510 rtm->rtm_flags |= RTM_F_CLONED; 3511 3512 if (dst) { 3513 if (nla_put_in6_addr(skb, RTA_DST, dst)) 3514 goto nla_put_failure; 3515 rtm->rtm_dst_len = 128; 3516 } else if (rtm->rtm_dst_len) 3517 if 
(nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 3518 goto nla_put_failure; 3519 #ifdef CONFIG_IPV6_SUBTREES 3520 if (src) { 3521 if (nla_put_in6_addr(skb, RTA_SRC, src)) 3522 goto nla_put_failure; 3523 rtm->rtm_src_len = 128; 3524 } else if (rtm->rtm_src_len && 3525 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 3526 goto nla_put_failure; 3527 #endif 3528 if (iif) { 3529 #ifdef CONFIG_IPV6_MROUTE 3530 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 3531 int err = ip6mr_get_route(net, skb, rtm, portid); 3532 3533 if (err == 0) 3534 return 0; 3535 if (err < 0) 3536 goto nla_put_failure; 3537 } else 3538 #endif 3539 if (nla_put_u32(skb, RTA_IIF, iif)) 3540 goto nla_put_failure; 3541 } else if (dst) { 3542 struct in6_addr saddr_buf; 3543 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 3544 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 3545 goto nla_put_failure; 3546 } 3547 3548 if (rt->rt6i_prefsrc.plen) { 3549 struct in6_addr saddr_buf; 3550 saddr_buf = rt->rt6i_prefsrc.addr; 3551 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 3552 goto nla_put_failure; 3553 } 3554 3555 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 3556 if (rt->rt6i_pmtu) 3557 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 3558 if (rtnetlink_put_metrics(skb, metrics) < 0) 3559 goto nla_put_failure; 3560 3561 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 3562 goto nla_put_failure; 3563 3564 /* For multipath routes, walk the siblings list and add 3565 * each as a nexthop within RTA_MULTIPATH. 3566 */ 3567 if (rt->rt6i_nsiblings) { 3568 struct rt6_info *sibling, *next_sibling; 3569 struct nlattr *mp; 3570 3571 mp = nla_nest_start(skb, RTA_MULTIPATH); 3572 if (!mp) 3573 goto nla_put_failure; 3574 3575 if (rt6_add_nexthop(skb, rt) < 0) 3576 goto nla_put_failure; 3577 3578 list_for_each_entry_safe(sibling, next_sibling, 3579 &rt->rt6i_siblings, rt6i_siblings) { 3580 if (rt6_add_nexthop(skb, sibling) < 0) 3581 goto nla_put_failure; 3582 } 3583 3584 nla_nest_end(skb, mp); 3585 } else { 3586 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0) 3587 goto nla_put_failure; 3588 } 3589 3590 expires = (rt->rt6i_flags & RTF_EXPIRES) ? 
rt->dst.expires - jiffies : 0; 3591 3592 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 3593 goto nla_put_failure; 3594 3595 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 3596 goto nla_put_failure; 3597 3598 3599 nlmsg_end(skb, nlh); 3600 return 0; 3601 3602 nla_put_failure: 3603 nlmsg_cancel(skb, nlh); 3604 return -EMSGSIZE; 3605 } 3606 3607 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 3608 { 3609 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 3610 struct net *net = arg->net; 3611 3612 if (rt == net->ipv6.ip6_null_entry) 3613 return 0; 3614 3615 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 3616 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 3617 3618 /* user wants prefix routes only */ 3619 if (rtm->rtm_flags & RTM_F_PREFIX && 3620 !(rt->rt6i_flags & RTF_PREFIX_RT)) { 3621 /* success since this is not a prefix route */ 3622 return 1; 3623 } 3624 } 3625 3626 return rt6_fill_node(net, 3627 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 3628 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 3629 NLM_F_MULTI); 3630 } 3631 3632 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 3633 struct netlink_ext_ack *extack) 3634 { 3635 struct net *net = sock_net(in_skb->sk); 3636 struct nlattr *tb[RTA_MAX+1]; 3637 int err, iif = 0, oif = 0; 3638 struct dst_entry *dst; 3639 struct rt6_info *rt; 3640 struct sk_buff *skb; 3641 struct rtmsg *rtm; 3642 struct flowi6 fl6; 3643 bool fibmatch; 3644 3645 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, 3646 extack); 3647 if (err < 0) 3648 goto errout; 3649 3650 err = -EINVAL; 3651 memset(&fl6, 0, sizeof(fl6)); 3652 rtm = nlmsg_data(nlh); 3653 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); 3654 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); 3655 3656 if (tb[RTA_SRC]) { 3657 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 3658 goto errout; 3659 3660 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 3661 } 3662 3663 if (tb[RTA_DST]) { 3664 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 3665 goto errout; 3666 3667 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); 3668 } 3669 3670 if (tb[RTA_IIF]) 3671 iif = nla_get_u32(tb[RTA_IIF]); 3672 3673 if (tb[RTA_OIF]) 3674 oif = nla_get_u32(tb[RTA_OIF]); 3675 3676 if (tb[RTA_MARK]) 3677 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); 3678 3679 if (tb[RTA_UID]) 3680 fl6.flowi6_uid = make_kuid(current_user_ns(), 3681 nla_get_u32(tb[RTA_UID])); 3682 else 3683 fl6.flowi6_uid = iif ? 
INVALID_UID : current_uid(); 3684 3685 if (iif) { 3686 struct net_device *dev; 3687 int flags = 0; 3688 3689 rcu_read_lock(); 3690 3691 dev = dev_get_by_index_rcu(net, iif); 3692 if (!dev) { 3693 rcu_read_unlock(); 3694 err = -ENODEV; 3695 goto errout; 3696 } 3697 3698 fl6.flowi6_iif = iif; 3699 3700 if (!ipv6_addr_any(&fl6.saddr)) 3701 flags |= RT6_LOOKUP_F_HAS_SADDR; 3702 3703 if (!fibmatch) 3704 dst = ip6_route_input_lookup(net, dev, &fl6, flags); 3705 else 3706 dst = ip6_route_lookup(net, &fl6, 0); 3707 3708 rcu_read_unlock(); 3709 } else { 3710 fl6.flowi6_oif = oif; 3711 3712 if (!fibmatch) 3713 dst = ip6_route_output(net, NULL, &fl6); 3714 else 3715 dst = ip6_route_lookup(net, &fl6, 0); 3716 } 3717 3718 3719 rt = container_of(dst, struct rt6_info, dst); 3720 if (rt->dst.error) { 3721 err = rt->dst.error; 3722 ip6_rt_put(rt); 3723 goto errout; 3724 } 3725 3726 if (rt == net->ipv6.ip6_null_entry) { 3727 err = rt->dst.error; 3728 ip6_rt_put(rt); 3729 goto errout; 3730 } 3731 3732 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3733 if (!skb) { 3734 ip6_rt_put(rt); 3735 err = -ENOBUFS; 3736 goto errout; 3737 } 3738 3739 skb_dst_set(skb, &rt->dst); 3740 if (fibmatch) 3741 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif, 3742 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3743 nlh->nlmsg_seq, 0); 3744 else 3745 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif, 3746 RTM_NEWROUTE, NETLINK_CB(in_skb).portid, 3747 nlh->nlmsg_seq, 0); 3748 if (err < 0) { 3749 kfree_skb(skb); 3750 goto errout; 3751 } 3752 3753 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3754 errout: 3755 return err; 3756 } 3757 3758 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, 3759 unsigned int nlm_flags) 3760 { 3761 struct sk_buff *skb; 3762 struct net *net = info->nl_net; 3763 u32 seq; 3764 int err; 3765 3766 err = -ENOBUFS; 3767 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 3768 3769 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); 3770 if (!skb) 3771 goto errout; 3772 3773 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, 3774 event, info->portid, seq, nlm_flags); 3775 if (err < 0) { 3776 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ 3777 WARN_ON(err == -EMSGSIZE); 3778 kfree_skb(skb); 3779 goto errout; 3780 } 3781 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, 3782 info->nlh, gfp_any()); 3783 return; 3784 errout: 3785 if (err < 0) 3786 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 3787 } 3788 3789 static int ip6_route_dev_notify(struct notifier_block *this, 3790 unsigned long event, void *ptr) 3791 { 3792 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 3793 struct net *net = dev_net(dev); 3794 3795 if (!(dev->flags & IFF_LOOPBACK)) 3796 return NOTIFY_OK; 3797 3798 if (event == NETDEV_REGISTER) { 3799 net->ipv6.ip6_null_entry->dst.dev = dev; 3800 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); 3801 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3802 net->ipv6.ip6_prohibit_entry->dst.dev = dev; 3803 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); 3804 net->ipv6.ip6_blk_hole_entry->dst.dev = dev; 3805 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); 3806 #endif 3807 } else if (event == NETDEV_UNREGISTER && 3808 dev->reg_state != NETREG_UNREGISTERED) { 3809 /* NETDEV_UNREGISTER can be fired multiple times by 3810 * netdev_wait_allrefs(). Make sure we only call this once. 
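(The reg_state check above, together with in6_dev_put_clear() NULLing the stored pointer, makes any repeated event a no-op.)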
3811 */ 3812 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); 3813 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3814 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); 3815 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); 3816 #endif 3817 } 3818 3819 return NOTIFY_OK; 3820 } 3821 3822 /* 3823 * /proc 3824 */ 3825 3826 #ifdef CONFIG_PROC_FS 3827 3828 static const struct file_operations ipv6_route_proc_fops = { 3829 .owner = THIS_MODULE, 3830 .open = ipv6_route_open, 3831 .read = seq_read, 3832 .llseek = seq_lseek, 3833 .release = seq_release_net, 3834 }; 3835 3836 static int rt6_stats_seq_show(struct seq_file *seq, void *v) 3837 { 3838 struct net *net = (struct net *)seq->private; 3839 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", 3840 net->ipv6.rt6_stats->fib_nodes, 3841 net->ipv6.rt6_stats->fib_route_nodes, 3842 net->ipv6.rt6_stats->fib_rt_alloc, 3843 net->ipv6.rt6_stats->fib_rt_entries, 3844 net->ipv6.rt6_stats->fib_rt_cache, 3845 dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 3846 net->ipv6.rt6_stats->fib_discarded_routes); 3847 3848 return 0; 3849 } 3850 3851 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 3852 { 3853 return single_open_net(inode, file, rt6_stats_seq_show); 3854 } 3855 3856 static const struct file_operations rt6_stats_seq_fops = { 3857 .owner = THIS_MODULE, 3858 .open = rt6_stats_seq_open, 3859 .read = seq_read, 3860 .llseek = seq_lseek, 3861 .release = single_release_net, 3862 }; 3863 #endif /* CONFIG_PROC_FS */ 3864 3865 #ifdef CONFIG_SYSCTL 3866 3867 static 3868 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 3869 void __user *buffer, size_t *lenp, loff_t *ppos) 3870 { 3871 struct net *net; 3872 int delay; 3873 if (!write) 3874 return -EINVAL; 3875 3876 net = (struct net *)ctl->extra1; 3877 delay = net->ipv6.sysctl.flush_delay; 3878 proc_dointvec(ctl, write, buffer, lenp, ppos); 3879 fib6_run_gc(delay <= 0 ? 
0 : (unsigned long)delay, net, delay > 0); 3880 return 0; 3881 } 3882 3883 struct ctl_table ipv6_route_table_template[] = { 3884 { 3885 .procname = "flush", 3886 .data = &init_net.ipv6.sysctl.flush_delay, 3887 .maxlen = sizeof(int), 3888 .mode = 0200, 3889 .proc_handler = ipv6_sysctl_rtcache_flush 3890 }, 3891 { 3892 .procname = "gc_thresh", 3893 .data = &ip6_dst_ops_template.gc_thresh, 3894 .maxlen = sizeof(int), 3895 .mode = 0644, 3896 .proc_handler = proc_dointvec, 3897 }, 3898 { 3899 .procname = "max_size", 3900 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 3901 .maxlen = sizeof(int), 3902 .mode = 0644, 3903 .proc_handler = proc_dointvec, 3904 }, 3905 { 3906 .procname = "gc_min_interval", 3907 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 3908 .maxlen = sizeof(int), 3909 .mode = 0644, 3910 .proc_handler = proc_dointvec_jiffies, 3911 }, 3912 { 3913 .procname = "gc_timeout", 3914 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 3915 .maxlen = sizeof(int), 3916 .mode = 0644, 3917 .proc_handler = proc_dointvec_jiffies, 3918 }, 3919 { 3920 .procname = "gc_interval", 3921 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, 3922 .maxlen = sizeof(int), 3923 .mode = 0644, 3924 .proc_handler = proc_dointvec_jiffies, 3925 }, 3926 { 3927 .procname = "gc_elasticity", 3928 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, 3929 .maxlen = sizeof(int), 3930 .mode = 0644, 3931 .proc_handler = proc_dointvec, 3932 }, 3933 { 3934 .procname = "mtu_expires", 3935 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, 3936 .maxlen = sizeof(int), 3937 .mode = 0644, 3938 .proc_handler = proc_dointvec_jiffies, 3939 }, 3940 { 3941 .procname = "min_adv_mss", 3942 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, 3943 .maxlen = sizeof(int), 3944 .mode = 0644, 3945 .proc_handler = proc_dointvec, 3946 }, 3947 { 3948 .procname = "gc_min_interval_ms", 3949 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 3950 .maxlen = sizeof(int), 3951 .mode = 0644, 3952 .proc_handler = proc_dointvec_ms_jiffies, 3953 }, 3954 { } 3955 }; 3956 3957 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) 3958 { 3959 struct ctl_table *table; 3960 3961 table = kmemdup(ipv6_route_table_template, 3962 sizeof(ipv6_route_table_template), 3963 GFP_KERNEL); 3964 3965 if (table) { 3966 table[0].data = &net->ipv6.sysctl.flush_delay; 3967 table[0].extra1 = net; 3968 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; 3969 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; 3970 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 3971 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; 3972 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; 3973 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; 3974 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; 3975 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; 3976 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; 3977 3978 /* Don't export sysctls to unprivileged users */ 3979 if (net->user_ns != &init_user_ns) 3980 table[0].procname = NULL; 3981 } 3982 3983 return table; 3984 } 3985 #endif 3986 3987 static int __net_init ip6_route_net_init(struct net *net) 3988 { 3989 int ret = -ENOMEM; 3990 3991 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, 3992 sizeof(net->ipv6.ip6_dst_ops)); 3993 3994 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) 3995 goto out_ip6_dst_ops; 3996 3997 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, 3998 sizeof(*net->ipv6.ip6_null_entry), 3999 GFP_KERNEL); 4000 if (!net->ipv6.ip6_null_entry) 4001 goto 
out_ip6_dst_entries; 4002 net->ipv6.ip6_null_entry->dst.path = 4003 (struct dst_entry *)net->ipv6.ip6_null_entry; 4004 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4005 dst_init_metrics(&net->ipv6.ip6_null_entry->dst, 4006 ip6_template_metrics, true); 4007 4008 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4009 net->ipv6.fib6_has_custom_rules = false; 4010 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, 4011 sizeof(*net->ipv6.ip6_prohibit_entry), 4012 GFP_KERNEL); 4013 if (!net->ipv6.ip6_prohibit_entry) 4014 goto out_ip6_null_entry; 4015 net->ipv6.ip6_prohibit_entry->dst.path = 4016 (struct dst_entry *)net->ipv6.ip6_prohibit_entry; 4017 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4018 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, 4019 ip6_template_metrics, true); 4020 4021 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, 4022 sizeof(*net->ipv6.ip6_blk_hole_entry), 4023 GFP_KERNEL); 4024 if (!net->ipv6.ip6_blk_hole_entry) 4025 goto out_ip6_prohibit_entry; 4026 net->ipv6.ip6_blk_hole_entry->dst.path = 4027 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry; 4028 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; 4029 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, 4030 ip6_template_metrics, true); 4031 #endif 4032 4033 net->ipv6.sysctl.flush_delay = 0; 4034 net->ipv6.sysctl.ip6_rt_max_size = 4096; 4035 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; 4036 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; 4037 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 4038 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 4039 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 4040 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 4041 4042 net->ipv6.ip6_rt_gc_expire = 30*HZ; 4043 4044 ret = 0; 4045 out: 4046 return ret; 4047 4048 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4049 out_ip6_prohibit_entry: 4050 kfree(net->ipv6.ip6_prohibit_entry); 4051 out_ip6_null_entry: 4052 kfree(net->ipv6.ip6_null_entry); 4053 #endif 4054 out_ip6_dst_entries: 4055 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4056 out_ip6_dst_ops: 4057 goto out; 4058 } 4059 4060 static void __net_exit ip6_route_net_exit(struct net *net) 4061 { 4062 kfree(net->ipv6.ip6_null_entry); 4063 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 4064 kfree(net->ipv6.ip6_prohibit_entry); 4065 kfree(net->ipv6.ip6_blk_hole_entry); 4066 #endif 4067 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 4068 } 4069 4070 static int __net_init ip6_route_net_init_late(struct net *net) 4071 { 4072 #ifdef CONFIG_PROC_FS 4073 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 4074 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); 4075 #endif 4076 return 0; 4077 } 4078 4079 static void __net_exit ip6_route_net_exit_late(struct net *net) 4080 { 4081 #ifdef CONFIG_PROC_FS 4082 remove_proc_entry("ipv6_route", net->proc_net); 4083 remove_proc_entry("rt6_stats", net->proc_net); 4084 #endif 4085 } 4086 4087 static struct pernet_operations ip6_route_net_ops = { 4088 .init = ip6_route_net_init, 4089 .exit = ip6_route_net_exit, 4090 }; 4091 4092 static int __net_init ipv6_inetpeer_init(struct net *net) 4093 { 4094 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 4095 4096 if (!bp) 4097 return -ENOMEM; 4098 inet_peer_base_init(bp); 4099 net->ipv6.peers = bp; 4100 return 0; 4101 } 4102 4103 static void __net_exit ipv6_inetpeer_exit(struct net *net) 4104 { 4105 struct inet_peer_base *bp = net->ipv6.peers; 4106 4107 net->ipv6.peers = NULL; 4108 inetpeer_invalidate_tree(bp); 
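/* The tree must be invalidated (all peer nodes dropped) before the base itself is freed below. */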

static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* The loopback device is registered before this code runs, so the
	 * loopback reference in these rt6_info templates was never taken;
	 * take it manually for init_net.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
			    RTNL_FLAG_DOIT_UNLOCKED))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
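
/* Module teardown: roughly the reverse of ip6_route_init().  The
 * netdevice notifier is removed first so no further device events
 * arrive, and the dst entry counters and the rt6_info kmem_cache are
 * destroyed last, after every user of them has been unregistered.
 */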

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}