/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		  routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		  reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Result of a next-hop reachability check (see rt6_check_neigh()).
 * Negative values reject the route; RT6_NUD_FAIL_DO_RR additionally asks
 * rt6_select() to rotate the round-robin pointer to the next sibling.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
#endif

/* Per-cpu list head for DST_NOCACHE routes that live outside the fib6
 * tree; entries are walked on device teardown (rt6_uncached_list_flush_dev).
 */
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/* Link an uncached (DST_NOCACHE) route onto this cpu's uncached list so it
 * can be found and retargeted when its device goes away.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->dst.flags |= DST_NOCACHE;
	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}
/* Unlink @rt from the per-cpu uncached list it was added to, if any. */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

/* Retarget every uncached route that references @dev (or any device when
 * @dev is NULL) to the namespace loopback device, so the original device
 * can be unregistered.  Both the idev and the dst device references are
 * moved, with refcounts transferred accordingly.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev && (rt_idev->dev == dev || !dev) &&
			    rt_idev->dev != loopback_dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev && (rt_dev == dev || !dev) &&
			    rt_dev != loopback_dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

/* Per-cpu clones share metrics with the route they were cloned from. */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	return dst_metrics_write_ptr(rt->dst.from);
}

/* Copy-on-write metrics hook: per-cpu clones write through to their parent,
 * RTF_CACHE clones never COW (return NULL), everything else uses the
 * generic COW path.
 */
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
		return NULL;
	else
		return dst_cow_metrics_generic(dst, old);
}

/* Pick the address to resolve for the next hop: the gateway if one is set,
 * else the packet's destination, else the caller-supplied daddr.
 */
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

/* dst_ops->neigh_lookup: find (or create) the ndisc neighbour entry for
 * this route's next hop on dst->dev.
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	ipv6_cow_metrics,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
};

/* Blackhole dsts report the metric MTU if set, else the device MTU
 * (GNU "x ? : y" elvis extension, standard in kernel code).
 */
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}
/* Blackhole routes deliberately ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

/* Blackhole routes deliberately ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

/* Blackhole routes never copy-on-write their metrics. */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

/* Template for the per-namespace "no route" entry (-ENETUNREACH). */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Template for the policy-routing "prohibit" entry (-EACCES). */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/* Template for the policy-routing "blackhole" entry (silent discard). */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_sk,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

/* allocate dst with ip6_dst_ops.  Zeroes everything after the embedded
 * dst_entry and initialises the list heads; @table is currently unused
 * here (callers pass it for the ip6_dst_alloc() wrapper's signature).
 */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags,
					struct fib6_table *table)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					0, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		struct dst_entry *dst = &rt->dst;

		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
		INIT_LIST_HEAD(&rt->rt6i_siblings);
		INIT_LIST_HEAD(&rt->rt6i_uncached);
	}
	return rt;
}

/* Like __ip6_dst_alloc() but also allocates the per-cpu clone pointer
 * array; on percpu allocation failure the route is destroyed and NULL
 * is returned.
 */
static struct rt6_info *ip6_dst_alloc(struct net *net,
				      struct net_device *dev,
				      int flags,
				      struct fib6_table *table)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p = NULL;
			}
		} else {
			dst_destroy((struct dst_entry *)rt);
			return NULL;
		}
	}

	return rt;
}

/* dst_ops->destroy: drop metrics, percpu clones, uncached-list linkage,
 * the idev reference and the "from" parent reference.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct dst_entry *from = dst->from;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	dst->from = NULL;
	dst_release(from);
}

/* dst_ops->ifdown: when @dev goes down, move this route's idev reference
 * over to the loopback device of @dev's namespace.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;
	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
	}
}

/* A route is expired if RTF_EXPIRES and its deadline passed, or,
 * recursively, if the route it was cloned from (dst.from) is expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->dst.from) {
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
	}
	return false;
}

/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	unsigned int val = fl6->flowi6_proto;

	val ^= ipv6_addr_hash(&fl6->daddr);
	val ^= ipv6_addr_hash(&fl6->saddr);

	/* Work only if this not encapsulated */
	switch (fl6->flowi6_proto) {
	case IPPROTO_UDP:
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
		val ^= (__force u16)fl6->fl6_sport;
		val ^= (__force u16)fl6->fl6_dport;
		break;

	case IPPROTO_ICMPV6:
		val ^= (__force u16)fl6->fl6_icmp_type;
		val ^= (__force u16)fl6->fl6_icmp_code;
		break;
	}
	/* RFC6438 recommends using the flow label */
	val ^= (__force u32)fl6->flowlabel;

	/* Perhaps, we need to tune, this function? */
	val = val ^ (val >> 7) ^ (val >> 12);
	return val % candidate_count;
}

/* Pick one route among @match and its ECMP siblings by flow hash; a
 * sibling that fails rt6_score_route() falls back to @match itself.
 */
static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;
	int route_choosen;

	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route, if route_choosen == 0
	 * (siblings does not include ourself)
	 */
	if (route_choosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_choosen--;
			if (route_choosen == 0) {
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
				match = sibling;
				break;
			}
		}
	return match;
}

/*
 *	Route lookup. Any table->tb6_lock is implied.
 */

/* Walk the leaf chain starting at @rt and return the first entry matching
 * the output interface (@oif) or, for saddr-only lookups, the first whose
 * device owns @saddr.  Loopback entries are remembered as a fallback.
 * Returns the null entry when strict interface matching finds nothing.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						struct rt6_info *rt,
						const struct in6_addr *saddr,
						int oif,
						int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr))
		goto out;

	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
		struct net_device *dev = sprt->dst.dev;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE && oif)
						continue;
					if (local && (!oif ||
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}
out:
	return rt;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work container for a router reachability probe (NS transmit). */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};
/* Workqueue handler: send a unicast-probe Neighbour Solicitation to the
 * router recorded in the work item, then drop the device reference.
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work;
	struct neighbour *neigh;
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		work = NULL;
		write_lock(&neigh->lock);
		/* Rate-limit: only probe if the entry is still invalid and
		 * the probe interval has elapsed since the last update.
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else {
		/* No neighbour entry yet: probe unconditionally. */
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/* Interface match score: 2 = direct match (or no oif requested),
 * 1 = loopback route whose idev matches oif, 0 = no match.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;
	if (!oif || dev->ifindex == oif)
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
}

/* Classify next-hop reachability; see enum rt6_nud_state.  Routes without
 * a gateway (or with RTF_NONEXTHOP) always succeed.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	struct neighbour *neigh;
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;

	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

/* Combined route score: interface match (low bits) plus decoded router
 * preference (<< 2).  May return a negative rt6_nud_state on failure.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

/* Score @rt against the current best (@match/@mpri); returns the better of
 * the two.  Sets *do_rr when the best route asks for round-robin rotation.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown)
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

/* Find the best route with @metric in @fn's leaf list, scanning from
 * @rr_head around to @rr_head (round-robin order).  If nothing with that
 * metric matched, continue from the first entry with a different metric.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}

/* Select the route to use from fib6 node @fn, honouring the round-robin
 * pointer (fn->rr_ptr) and advancing it when find_match() requested it.
 * NOTE(review): assumes fn->leaf is non-NULL for nodes reached here —
 * rt0->rt6i_metric is dereferenced unconditionally.
 */
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
	struct net *net;
	bool do_rr = false;

	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;

	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct rt6_info *next = rt0->dst.rt6_next;

		/* no entries matched; do round-robin */
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
	}

	net = dev_net(rt0->dst.dev);
	return match ? match : net->ipv6.ip6_null_entry;
}
/* True when the route has an explicit next hop (gateway) or RTF_NONEXTHOP. */
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option (RFC 4191) received in a Router
 * Advertisement from @gwaddr on @dev: validate the option, then add,
 * update or delete the corresponding RTF_ROUTEINFO route according to
 * the advertised lifetime and preference.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev->ifindex);

	/* Zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

/* Walk up from @fn until a node carrying route info is found, descending
 * into source-routing subtrees (FIB6_SUBTREE) on the way; returns NULL at
 * the tree root.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = fn->parent;
		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

/* Policy-routing lookup callback: flat (non-caching) lookup in @table,
 * with device matching and optional multipath selection; backtracks on
 * null-entry results.  Returns a held dst.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	rt = fn->leaf;
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	dst_use(&rt->dst, jiffies);
	read_unlock_bh(&table->tb6_lock);
	return rt;

}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

/* Convenience lookup by daddr/saddr/oif; returns a held rt6_info on
 * success or NULL on error (the error dst is released internally).
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   be destroyed.
 */

static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc)
{
	int err;
	struct fib6_table *table;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc);
	write_unlock_bh(&table->tb6_lock);

	return err;
}

int ip6_ins_rt(struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc);
}

/* Create an RTF_CACHE clone of @ort for destination @daddr (and @saddr in
 * subtree configurations).  If @ort is itself a clone, clone from its
 * parent instead.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = (struct rt6_info *)ort->dst.from;

	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
			     0, ort->rt6i_table);

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
/* Allocate a per-cpu (RTF_PCPU) clone of @rt sharing its table and flags. */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
				  rt->dst.dev, rt->dst.flags,
				  rt->rt6i_table);

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		goto done;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

		pcpu_rt = net->ipv6.ip6_null_entry;
		goto done;
	}

	/* Publish the clone; lose the race gracefully by using the
	 * winner's entry and destroying ours.
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	if (prev) {
		/* If someone did it before us, return prev instead */
		dst_destroy(&pcpu_rt->dst);
		pcpu_rt = prev;
	}

done:
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}

/* Main policy-routing lookup: select a route from @table (retrying without
 * the reachability requirement if nothing reachable is found), then return
 * either the fib entry itself, an uncached RTF_CACHE clone (KNOWN_NH case),
 * or a per-cpu clone.  Always returns a held dst.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
				      struct flowi6 *fl6, int flags)
{
	struct fib6_node *fn, *saved_fn;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt->rt6i_nsiblings)
		rt = rt6_multipath_select(rt, fl6, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}


	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);

		if (uncached_rt)
			rt6_uncached_list_add(uncached_rt);
		else
			uncached_rt = net->ipv6.ip6_null_entry;

		dst_hold(&uncached_rt->dst);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);
		read_unlock_bh(&table->tb6_lock);

		return pcpu_rt;
	}
}

/* fib rule callback for the input path: route by input interface. */
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
					    struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
}

static struct dst_entry *ip6_route_input_lookup(struct net *net,
						struct net_device *dev,
						struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}

/* Attach an input route to @skb based on its IPv6 header (and any RX
 * tunnel metadata), replacing any dst already set.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};

	tun_info = skb_tunnel_info(skb);
	if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
	skb_dst_drop(skb);
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net
					     *net, struct fib6_table *table,
					     struct flowi6 *fl6, int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}

/* Resolve an output route for locally generated traffic.  @sk may be NULL;
 * when present its bound device and source-address preferences influence
 * the lookup flags.
 */
struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
				   struct flowi6 *fl6)
{
	int flags = 0;

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!ipv6_addr_any(&fl6->saddr))
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL(ip6_route_output);

/* Build a blackhole copy of @dst_orig (input/output discard everything),
 * preserving its metrics and addressing.  Always releases @dst_orig.
 * Returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		new = &rt->dst;

		/* Clear everything past the embedded dst_entry. */
		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_sk;

		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		dst_free(new);
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

/* Re-sync this dst's metrics pointer with its parent (->dst.from) if the
 * parent's metrics were replaced since we last looked.
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (rt->dst.from &&
	    dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
		dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
}

/* Validate a tree-owned route against the fib6 node serial number cookie;
 * NULL means the cached dst is stale and must be re-looked-up.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

/* Validate a clone by checking its parent route instead. */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	/* Per-cpu and uncached clones validate via their parent route. */
	if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

/* dst_ops->negative_advice: drop an expired cached route, or release a
 * non-cache dst so the caller re-resolves.  Returns the (possibly NULL)
 * dst the caller should keep.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

/* dst_ops->link_failure: report unreachability to the sender and
 * invalidate the offending route.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* Hold across deletion; free ourselves if the tree
			 * no longer owned it.
			 */
			dst_hold(&rt->dst);
			if (ip6_del_rt(rt))
				dst_free(&rt->dst);
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
			/* Poison the node serial so cached dsts re-validate. */
			rt->rt6i_node->fn_sernum = -1;
		}
	}
}

/* Record a learned path MTU on @rt and arm its expiry timer. */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

/* Apply a PMTU update to @dst.  For non-cache routes a host clone is
 * created (keyed by iph or sk addresses) and inserted so the smaller MTU
 * only affects this destination.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	dst_confirm(dst);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (rt6->rt6i_flags & RTF_CACHE) {
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}

/* dst_ops->update_pmtu entry point. */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

/* Update the PMTU for the flow described by @skb's embedded IPv6 header
 * (e.g. from an ICMPv6 Packet Too Big).  @mtu is network byte order.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket convenience wrapper around ip6_update_pmtu(). */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;	/* router the redirect came from */
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		/* Only accept a route whose nexthop is the redirecting
		 * router itself.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	return rt;
};

/* Perform a redirect-specific lookup: the extended flow carries the
 * redirecting router's address for __ip6_route_redirect() to match on.
 */
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}

/* Process a redirect for the flow in @skb's embedded IPv6 header; the
 * redirecting router is taken from the outer header's source address.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

/* As ip6_redirect(), but for redirects without an embedded header: the
 * destination comes from the rd_msg and the router from the IPv6 header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

/* Socket convenience wrapper around ip6_redirect(). */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

/* dst_ops->default_advmss: derive an advertised MSS from the path MTU,
 * clamped between ip6_rt_min_advmss and the maximal non-jumbo payload.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

/* dst_ops->mtu: learned PMTU first, then the metric, then the device's
 * IPv6 MTU, finally capped at IP6_MAX_MTU.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
	struct inet6_dev *idev;

	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	return min_t(unsigned int, mtu, IP6_MAX_MTU);
}

/* Singly-linked list of dsts handed out for ICMPv6, reaped by
 * icmp6_dst_gc(); protected by icmp6_dst_lock.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);

/* Allocate a host dst for an outgoing ICMPv6 packet.  The dst is not in
 * the fib6 tree; it is chained on icmp6_dst_gc_list for later GC.
 * Returns the (xfrm-resolved) dst or an ERR_PTR.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0, NULL);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.output = ip6_output;
	atomic_set(&rt->dst.__refcnt, 1);
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	spin_lock_bh(&icmp6_dst_lock);
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
	spin_unlock_bh(&icmp6_dst_lock);

	fib6_force_start_gc(net);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

/* Free unreferenced entries on icmp6_dst_gc_list.  Returns the number of
 * entries still alive (so callers know whether to reschedule GC).
 */
int icmp6_dst_gc(void)
{
	struct dst_entry *dst, **pprev;
	int more = 0;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;

	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
			++more;
		}
	}

	spin_unlock_bh(&icmp6_dst_lock);

	return more;
}

/* Walk icmp6_dst_gc_list and free every entry for which @func returns
 * non-zero (e.g. device-down cleanup).
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry *dst, **pprev;

	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
	while ((dst = *pprev) != NULL) {
		struct rt6_info *rt = (struct rt6_info *) dst;
		if (func(rt, arg)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
		}
	}
	spin_unlock_bh(&icmp6_dst_lock);
}

/* dst_ops->gc: run fib6 GC when the entry count or elapsed time warrants,
 * with an adaptive aggressiveness counter (ip6_rt_gc_expire).
 * Returns non-zero when the table is still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	/* Decay the aggressiveness counter exponentially. */
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}

/* Translate the RTA_METRICS netlink attributes in @cfg into an RTAX_MAX
 * sized metrics array in @mxc.  On success the caller owns mxc->mx and
 * must kfree() it.  Returns 0, -ENOMEM or -EINVAL.
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	mp =
	    kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);

		if (type) {
			u32 val;

			if (unlikely(type > RTAX_MAX))
				goto err;
			if (type == RTAX_CC_ALGO) {
				/* Congestion-control algorithm arrives as a
				 * name string; store its key instead.
				 */
				char tmp[TCP_CA_NAME_MAX];

				nla_strlcpy(tmp, nla, sizeof(tmp));
				val = tcp_ca_get_key_by_name(tmp);
				if (val == TCP_CA_UNSPEC)
					goto err;
			} else {
				val = nla_get_u32(nla);
			}

			mp[type - 1] = val;
			__set_bit(type - 1, mxc->mx_valid);
		}
	}

	mxc->mx = mp;

	return 0;
err:
	kfree(mp);
	return -EINVAL;
}

/* Add a route described by @cfg to the appropriate fib6 table.
 * Validates prefix lengths, resolves the output device (directly or via
 * the gateway), handles reject/blackhole pseudo-routes and lightweight
 * tunnel state, then inserts the route.  Returns 0 or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg)
{
	int err;
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	struct mx6_config mxc = { .mx = NULL, };
	int addr_type;

	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
	/* Source-specific routes need routing subtrees support. */
	if (cfg->fc_src_len)
		return -EINVAL;
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ?
			   0 : DST_NOCOUNT, table);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (addr_type & IPV6_ADDR_MULTICAST)
		rt->dst.input = ip6_mc_input;
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
	else
		rt->dst.input = ip6_forward;

	rt->dst.output = ip6_output;

	if (cfg->fc_encap) {
		/* Lightweight tunnel encapsulation: interpose its hooks on
		 * the input/output paths.
		 */
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(dev, cfg->fc_encap_type,
					   cfg->fc_encap, &lwtstate);
		if (err)
			goto out;
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
		}
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
		}
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so.
		 */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
			rt->dst.output = dst_discard_sk;
			rt->dst.input = dst_discard;
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
			break;
		case RTN_THROW:
		default:
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
					: -ENETUNREACH;
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
			break;
		}
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		const struct in6_addr *gw_addr;
		int gwa_type;

		gw_addr = &cfg->fc_gateway;
		gwa_type = ipv6_addr_type(gw_addr);

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
			goto out;

		rt->rt6i_gateway = *gw_addr;

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			if (!(gwa_type & IPV6_ADDR_UNICAST))
				goto out;

			/* Resolve the device through the route to the gateway. */
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);

			err = -EHOSTUNREACH;
			if (!grt)
				goto out;
			if (dev) {
				if (dev != grt->dst.dev) {
					ip6_rt_put(grt);
					goto out;
				}
			} else {
				dev = grt->dst.dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
			/* The gateway must itself be directly reachable. */
			if (!(grt->rt6i_flags & RTF_GATEWAY))
				err = 0;
			ip6_rt_put(grt);

			if (err)
				goto out;
		}
		err = -EINVAL;
		if (!dev || (dev->flags & IFF_LOOPBACK))
			goto out;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		/* The preferred source must be an address on @dev. */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);
	return err;
out:
	/* Error path: drop every reference taken above. */
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);
	return err;
}

/* Remove @rt from its fib6 table and drop the caller's reference.
 * The null entry is never deleted (-ENOENT).
 */
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	int err;
	struct fib6_table *table;
	struct net *net = dev_net(rt->dst.dev);

	if (rt == net->ipv6.ip6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	write_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}

/* Public deletion entry point with default netlink info. */
int ip6_del_rt(struct rt6_info *rt)
{
	struct nl_info info = {
		.nl_net = dev_net(rt->dst.dev),
	};
	return __ip6_del_rt(rt, &info);
}

/* Find and delete the route matching @cfg (table, prefix, device,
 * gateway, metric).  Returns 0 on deletion or -ESRCH when no route
 * matched.
 */
static int ip6_route_del(struct fib6_config *cfg)
{
	struct fib6_table *table;
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table)
		return err;

	read_lock_bh(&table->tb6_lock);

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);

	if (fn) {
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
			/* Cached clones are only removable when explicitly
			 * requested via RTF_CACHE.
			 */
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
			if (cfg->fc_ifindex &&
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			/* Hold across the unlock; __ip6_del_rt consumes it. */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);

			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
		}
	}
	read_unlock_bh(&table->tb6_lock);

	return err;
}

/* Validate and apply an ICMPv6 Redirect carried in @skb: update the
 * neighbour cache for the new nexthop and install an RTF_CACHE clone of
 * @dst pointing at it.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		/* Target == destination: the destination is on-link. */
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* Routers and interfaces with accept_redirects off ignore redirects. */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt == net->ipv6.ip6_null_entry) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	neigh_update(neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	if (ip6_ins_rt(nrt))
		goto out;

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

	if (rt->rt6i_flags & RTF_CACHE) {
		/* The new clone supersedes the old cached route. */
		rt = (struct rt6_info *) dst_clone(&rt->dst);
		ip6_del_rt(rt);
	}

out:
	neigh_release(neigh);
}

/*
 *	Misc support functions
 */

/* Make @rt a child of @from: share its metrics and hold a reference so
 * @from outlives the clone.  @from must not itself be a clone.
 */
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}

/* Copy the routing state of @ort into the freshly allocated @rt (used by
 * both RTF_CACHE and per-cpu clones).
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->rt6i_dst = ort->rt6i_dst;
	rt->dst.error = ort->dst.error;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
	rt->dst.lastuse = jiffies;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RA route-information route for @prefix via @gwaddr on
 * @ifindex; returns it with a hold, or NULL.
 */
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int
					   prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	/* RA route-information options live in RT6_TABLE_INFO. */
	table = fib6_get_table(net, RT6_TABLE_INFO);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
	if (!fn)
		goto out;

	/* Look for a RTF_ROUTEINFO gateway route on @ifindex via @gwaddr. */
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->dst.dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
		/* Reference is returned to the caller. */
		dst_hold(&rt->dst);
		break;
	}
out:
	read_unlock_bh(&table->tb6_lock);
	return rt;
}

/* Install a route learned from an RA route-information option and
 * return a referenced pointer to it (or NULL on failure).
 */
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_INFO,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0.
	 */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	/* ip6_route_add() failure is tolerated: the lookup below simply
	 * returns NULL in that case.
	 */
	ip6_route_add(&cfg);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
}
#endif

/* Find the RA-learned default route via @addr on @dev, returning a
 * referenced pointer or NULL.
 */
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
	if (!table)
		return NULL;

	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (dev == rt->dst.dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
		dst_hold(&rt->dst);
	read_unlock_bh(&table->tb6_lock);
	return rt;
}

/* Add a default router learned via RA; the entry expires (RTF_EXPIRES)
 * unless refreshed.  Returns a referenced pointer or NULL.
 */
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = dev_net(dev),
	};

	cfg.fc_gateway = *gwaddr;

	ip6_route_add(&cfg);

	return rt6_get_dflt_router(gwaddr, dev);
}

/* Delete all RA-learned default routes, except on interfaces that
 * still accept RAs while forwarding (accept_ra == 2).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct rt6_info *rt;
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
	table = fib6_get_table(net, RT6_TABLE_DFLT);
	if (!table)
		return;

restart:
	read_lock_bh(&table->tb6_lock);
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			/* Hold a reference, drop the lock, delete, and
			 * rescan from the start (list may have changed).
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
			ip6_del_rt(rt);
			goto restart;
		}
	}
	read_unlock_bh(&table->tb6_lock);
}

/* Translate the legacy ioctl in6_rtmsg into a fib6_config. */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	/* ioctl routes always go to the main table. */
	cfg->fc_table = RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}

/* Legacy SIOCADDRT/SIOCDELRT entry point.  Requires CAP_NET_ADMIN in
 * the owning user namespace; route mutation runs under the RTNL lock.
 * Returns 0 or a negative errno.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* copy_from_user() returns the number of bytes NOT copied. */
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

/* Common helper for the discard/prohibit dst ops: bump the proper SNMP
 * counter, send an ICMPv6 destination-unreachable with @code, and free
 * the skb.  Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* Unspecified destination: count as address error,
			 * not as no-route.
			 */
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

/* dst input handler for blackhole/no-route entries (ingress). */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

/* dst output handler for blackhole/no-route entries (egress). */
static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* dst input handler for administratively prohibited routes (ingress). */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/* dst output handler for administratively prohibited routes (egress). */
static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build a host route (plen 128) for a local unicast or anycast address
 * on @idev, homed on the loopback device.  The route is returned with
 * one reference held but is NOT yet inserted into any table.
 * Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	struct net *net = dev_net(idev->dev);
	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
					    DST_NOCOUNT, NULL);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_idev = idev;

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
		rt->rt6i_flags |= RTF_LOCAL;

	rt->rt6i_gateway = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);

	atomic_set(&rt->dst.__refcnt, 1);

	return rt;
}

/* Select a source address for @daddr: prefer the route's configured
 * preferred source, falling back to ipv6_dev_get_saddr().  Returns 0
 * with *saddr set, or a negative errno from the fallback lookup.
 */
int ip6_route_get_saddr(struct net *net,
			struct rt6_info *rt,
			const struct in6_addr *daddr,
			unsigned int prefs,
			struct in6_addr *saddr)
{
	struct inet6_dev *idev =
		rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
	int err = 0;
	if (rt && rt->rt6i_prefsrc.plen)
		*saddr = rt->rt6i_prefsrc.addr;
	else
		err = ipv6_dev_get_saddr(net, idev ?
					 idev->dev : NULL,
					 daddr, prefs, saddr);
	return err;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* limit to this device; NULL = any */
	struct net *net;
	struct in6_addr *addr;	/* preferred source address being removed */
};

/* fib6_clean_all() callback: clear rt6i_prefsrc on routes whose
 * preferred source matches the address being removed.  Always returns
 * 0 (never deletes the route itself).
 */
static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->dst.dev == dev || !dev) &&
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
	}
	return 0;
}

/* Walk the FIB and drop @ifp's address from all prefsrc entries. */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host.
 */
/* fib6_clean_all() callback: returning -1 asks the walker to delete the
 * route.  Matches RA-learned router routes and cached gateway routes
 * whose gateway equals @arg.
 */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_dev_net {
	struct net_device *dev;	/* limit to this device; NULL = any */
	struct net *net;
};

/* fib6_clean_all()/icmp6_clean_all() callback: delete (-1) every route
 * on @dev except the per-netns null entry.
 */
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;

	if ((rt->dst.dev == dev || !dev) &&
	    rt != adn->net->ipv6.ip6_null_entry)
		return -1;

	return 0;
}

/* Purge all routing state referencing @dev (FIB, ICMP rate-limit
 * routes, uncached dsts) when the device goes down.
 */
void rt6_ifdown(struct net *net, struct net_device *dev)
{
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, &adn);
	icmp6_clean_all(fib6_ifdown, &adn);
	rt6_uncached_list_flush_dev(net, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;	/* new device MTU */
};

/* fib6_clean_all() callback: propagate a device MTU change into route
 * MTU metrics.  Never deletes routes (always returns 0).
 */
static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e.
	   jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discouvery.
	 */
	if (rt->dst.dev == arg->dev &&
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
	}
	return 0;
}

/* Called on NETDEV_CHANGEMTU: update all routes on @dev to the new MTU. */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE. */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]               = { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
};

/* Parse an RTM_NEWROUTE/RTM_DELROUTE message into a fib6_config.
 * Returns 0 on success or a negative errno (bad attribute lengths,
 * parse failure).  Note: cfg->fc_mx/fc_mp point INTO the skb; they are
 * only valid while the message is.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	/* Reject-type routes map to RTF_REJECT; the concrete behaviour
	 * (blackhole vs prohibit vs throw) is kept in fc_type.
	 */
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* Only prefix-len bytes of the address are required. */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
	}

	if (tb[RTA_PREF]) {
		/* Unknown preference values degrade to MEDIUM (RFC 4191). */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE])
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

	err = 0;
errout:
	return err;
}

/* Add (@add != 0) or delete each nexthop of an RTA_MULTIPATH route as
 * an individual route.  On a failed add, already-added nexthops are
 * rolled back by re-running the loop in delete mode over the prefix of
 * entries processed so far.  Delete failures are recorded but do not
 * stop the loop.  Returns 0 or the last error seen.
 */
static int ip6_route_multipath(struct fib6_config *cfg, int add)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 0, last_err = 0;

	remaining = cfg->fc_mp_len;
beginning:
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}
		err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
		if (err) {
			last_err = err;
			/* If we are trying to remove a route, do not stop the
			 * loop when ip6_route_del() fails (because next hop is
			 * already gone), we should try to remove all next hops.
			 */
			if (add) {
				/* If add fails, we should try to delete all
				 * next hops that have been already added.
				 */
				add = 0;
				/* Re-run only over the already-processed
				 * prefix of the nexthop list.
				 */
				remaining = cfg->fc_mp_len - remaining;
				goto beginning;
			}
		}
		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

/* RTM_DELROUTE handler: multipath messages fan out per-nexthop. */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath(&cfg, 0);
	else
		return ip6_route_del(&cfg);
}

/* RTM_NEWROUTE handler: multipath messages fan out per-nexthop. */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath(&cfg, 1);
	else
		return ip6_route_add(&cfg);
}

/* Worst-case netlink message size for one route; must stay in sync
 * with what rt6_fill_node() emits or notifications will WARN.
 */
static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->dst.lwtstate);
}
2850 2851 static int rt6_fill_node(struct net *net, 2852 struct sk_buff *skb, struct rt6_info *rt, 2853 struct in6_addr *dst, struct in6_addr *src, 2854 int iif, int type, u32 portid, u32 seq, 2855 int prefix, int nowait, unsigned int flags) 2856 { 2857 u32 metrics[RTAX_MAX]; 2858 struct rtmsg *rtm; 2859 struct nlmsghdr *nlh; 2860 long expires; 2861 u32 table; 2862 2863 if (prefix) { /* user wants prefix routes only */ 2864 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { 2865 /* success since this is not a prefix route */ 2866 return 1; 2867 } 2868 } 2869 2870 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); 2871 if (!nlh) 2872 return -EMSGSIZE; 2873 2874 rtm = nlmsg_data(nlh); 2875 rtm->rtm_family = AF_INET6; 2876 rtm->rtm_dst_len = rt->rt6i_dst.plen; 2877 rtm->rtm_src_len = rt->rt6i_src.plen; 2878 rtm->rtm_tos = 0; 2879 if (rt->rt6i_table) 2880 table = rt->rt6i_table->tb6_id; 2881 else 2882 table = RT6_TABLE_UNSPEC; 2883 rtm->rtm_table = table; 2884 if (nla_put_u32(skb, RTA_TABLE, table)) 2885 goto nla_put_failure; 2886 if (rt->rt6i_flags & RTF_REJECT) { 2887 switch (rt->dst.error) { 2888 case -EINVAL: 2889 rtm->rtm_type = RTN_BLACKHOLE; 2890 break; 2891 case -EACCES: 2892 rtm->rtm_type = RTN_PROHIBIT; 2893 break; 2894 case -EAGAIN: 2895 rtm->rtm_type = RTN_THROW; 2896 break; 2897 default: 2898 rtm->rtm_type = RTN_UNREACHABLE; 2899 break; 2900 } 2901 } 2902 else if (rt->rt6i_flags & RTF_LOCAL) 2903 rtm->rtm_type = RTN_LOCAL; 2904 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)) 2905 rtm->rtm_type = RTN_LOCAL; 2906 else 2907 rtm->rtm_type = RTN_UNICAST; 2908 rtm->rtm_flags = 0; 2909 if (!netif_carrier_ok(rt->dst.dev)) { 2910 rtm->rtm_flags |= RTNH_F_LINKDOWN; 2911 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) 2912 rtm->rtm_flags |= RTNH_F_DEAD; 2913 } 2914 rtm->rtm_scope = RT_SCOPE_UNIVERSE; 2915 rtm->rtm_protocol = rt->rt6i_protocol; 2916 if (rt->rt6i_flags & RTF_DYNAMIC) 2917 rtm->rtm_protocol = RTPROT_REDIRECT; 2918 else if 
(rt->rt6i_flags & RTF_ADDRCONF) { 2919 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) 2920 rtm->rtm_protocol = RTPROT_RA; 2921 else 2922 rtm->rtm_protocol = RTPROT_KERNEL; 2923 } 2924 2925 if (rt->rt6i_flags & RTF_CACHE) 2926 rtm->rtm_flags |= RTM_F_CLONED; 2927 2928 if (dst) { 2929 if (nla_put_in6_addr(skb, RTA_DST, dst)) 2930 goto nla_put_failure; 2931 rtm->rtm_dst_len = 128; 2932 } else if (rtm->rtm_dst_len) 2933 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) 2934 goto nla_put_failure; 2935 #ifdef CONFIG_IPV6_SUBTREES 2936 if (src) { 2937 if (nla_put_in6_addr(skb, RTA_SRC, src)) 2938 goto nla_put_failure; 2939 rtm->rtm_src_len = 128; 2940 } else if (rtm->rtm_src_len && 2941 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) 2942 goto nla_put_failure; 2943 #endif 2944 if (iif) { 2945 #ifdef CONFIG_IPV6_MROUTE 2946 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) { 2947 int err = ip6mr_get_route(net, skb, rtm, nowait); 2948 if (err <= 0) { 2949 if (!nowait) { 2950 if (err == 0) 2951 return 0; 2952 goto nla_put_failure; 2953 } else { 2954 if (err == -EMSGSIZE) 2955 goto nla_put_failure; 2956 } 2957 } 2958 } else 2959 #endif 2960 if (nla_put_u32(skb, RTA_IIF, iif)) 2961 goto nla_put_failure; 2962 } else if (dst) { 2963 struct in6_addr saddr_buf; 2964 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 && 2965 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 2966 goto nla_put_failure; 2967 } 2968 2969 if (rt->rt6i_prefsrc.plen) { 2970 struct in6_addr saddr_buf; 2971 saddr_buf = rt->rt6i_prefsrc.addr; 2972 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) 2973 goto nla_put_failure; 2974 } 2975 2976 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2977 if (rt->rt6i_pmtu) 2978 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu; 2979 if (rtnetlink_put_metrics(skb, metrics) < 0) 2980 goto nla_put_failure; 2981 2982 if (rt->rt6i_flags & RTF_GATEWAY) { 2983 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0) 2984 goto nla_put_failure; 2985 } 
2986 2987 if (rt->dst.dev && 2988 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2989 goto nla_put_failure; 2990 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 2991 goto nla_put_failure; 2992 2993 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0; 2994 2995 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 2996 goto nla_put_failure; 2997 2998 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) 2999 goto nla_put_failure; 3000 3001 lwtunnel_fill_encap(skb, rt->dst.lwtstate); 3002 3003 nlmsg_end(skb, nlh); 3004 return 0; 3005 3006 nla_put_failure: 3007 nlmsg_cancel(skb, nlh); 3008 return -EMSGSIZE; 3009 } 3010 3011 int rt6_dump_route(struct rt6_info *rt, void *p_arg) 3012 { 3013 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; 3014 int prefix; 3015 3016 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) { 3017 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh); 3018 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; 3019 } else 3020 prefix = 0; 3021 3022 return rt6_fill_node(arg->net, 3023 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, 3024 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, 3025 prefix, 0, NLM_F_MULTI); 3026 } 3027 3028 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) 3029 { 3030 struct net *net = sock_net(in_skb->sk); 3031 struct nlattr *tb[RTA_MAX+1]; 3032 struct rt6_info *rt; 3033 struct sk_buff *skb; 3034 struct rtmsg *rtm; 3035 struct flowi6 fl6; 3036 int err, iif = 0, oif = 0; 3037 3038 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); 3039 if (err < 0) 3040 goto errout; 3041 3042 err = -EINVAL; 3043 memset(&fl6, 0, sizeof(fl6)); 3044 3045 if (tb[RTA_SRC]) { 3046 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) 3047 goto errout; 3048 3049 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); 3050 } 3051 3052 if (tb[RTA_DST]) { 3053 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) 3054 goto errout; 3055 3056 fl6.daddr 
			  = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (iif) {
		/* Input-side lookup: resolve the route as if the packet
		 * had arrived on @iif.
		 */
		struct net_device *dev;
		int flags = 0;

		dev = __dev_get_by_index(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));

	/* The skb takes over the route reference. */
	skb_dst_set(skb, &rt->dst);

	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
			    nlh->nlmsg_seq, 0, 0, 0);
	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE group.  Best effort: on failure the group is
 * marked with the error via rtnl_set_sk_err().
 */
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
			    event, info->portid, seq, 0, 0, 0);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/* netdevice notifier: when the per-netns loopback registers, point the
 * special (null/prohibit/blackhole) route templates at it.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

/* /proc/net/ipv6_route; ipv6_route_open is defined elsewhere. */
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

/* Render /proc/net/rt6_stats: seven hex-formatted fib counters. */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
dst_entries_get_slow(&net->ipv6.ip6_dst_ops), 3193 net->ipv6.rt6_stats->fib_discarded_routes); 3194 3195 return 0; 3196 } 3197 3198 static int rt6_stats_seq_open(struct inode *inode, struct file *file) 3199 { 3200 return single_open_net(inode, file, rt6_stats_seq_show); 3201 } 3202 3203 static const struct file_operations rt6_stats_seq_fops = { 3204 .owner = THIS_MODULE, 3205 .open = rt6_stats_seq_open, 3206 .read = seq_read, 3207 .llseek = seq_lseek, 3208 .release = single_release_net, 3209 }; 3210 #endif /* CONFIG_PROC_FS */ 3211 3212 #ifdef CONFIG_SYSCTL 3213 3214 static 3215 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, 3216 void __user *buffer, size_t *lenp, loff_t *ppos) 3217 { 3218 struct net *net; 3219 int delay; 3220 if (!write) 3221 return -EINVAL; 3222 3223 net = (struct net *)ctl->extra1; 3224 delay = net->ipv6.sysctl.flush_delay; 3225 proc_dointvec(ctl, write, buffer, lenp, ppos); 3226 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); 3227 return 0; 3228 } 3229 3230 struct ctl_table ipv6_route_table_template[] = { 3231 { 3232 .procname = "flush", 3233 .data = &init_net.ipv6.sysctl.flush_delay, 3234 .maxlen = sizeof(int), 3235 .mode = 0200, 3236 .proc_handler = ipv6_sysctl_rtcache_flush 3237 }, 3238 { 3239 .procname = "gc_thresh", 3240 .data = &ip6_dst_ops_template.gc_thresh, 3241 .maxlen = sizeof(int), 3242 .mode = 0644, 3243 .proc_handler = proc_dointvec, 3244 }, 3245 { 3246 .procname = "max_size", 3247 .data = &init_net.ipv6.sysctl.ip6_rt_max_size, 3248 .maxlen = sizeof(int), 3249 .mode = 0644, 3250 .proc_handler = proc_dointvec, 3251 }, 3252 { 3253 .procname = "gc_min_interval", 3254 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, 3255 .maxlen = sizeof(int), 3256 .mode = 0644, 3257 .proc_handler = proc_dointvec_jiffies, 3258 }, 3259 { 3260 .procname = "gc_timeout", 3261 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, 3262 .maxlen = sizeof(int), 3263 .mode = 0644, 3264 .proc_handler = proc_dointvec_jiffies, 
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};

/* Duplicate the sysctl template for namespace @net, rewiring each
 * entry's .data at its per-netns storage.  NOTE: the table[N] indices
 * must track the template above.  Returns NULL on allocation failure.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

/* Per-netns setup: clone the dst_ops and the special route entries
 * (null, and with multiple tables also prohibit/blackhole), then seed
 * the routing sysctl defaults.  Error unwind order mirrors setup order.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	net->ipv6.ip6_null_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_prohibit_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.path =
		(struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Routing sysctl defaults for this namespace. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; 3384 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; 3385 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; 3386 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; 3387 3388 net->ipv6.ip6_rt_gc_expire = 30*HZ; 3389 3390 ret = 0; 3391 out: 3392 return ret; 3393 3394 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3395 out_ip6_prohibit_entry: 3396 kfree(net->ipv6.ip6_prohibit_entry); 3397 out_ip6_null_entry: 3398 kfree(net->ipv6.ip6_null_entry); 3399 #endif 3400 out_ip6_dst_entries: 3401 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 3402 out_ip6_dst_ops: 3403 goto out; 3404 } 3405 3406 static void __net_exit ip6_route_net_exit(struct net *net) 3407 { 3408 kfree(net->ipv6.ip6_null_entry); 3409 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3410 kfree(net->ipv6.ip6_prohibit_entry); 3411 kfree(net->ipv6.ip6_blk_hole_entry); 3412 #endif 3413 dst_entries_destroy(&net->ipv6.ip6_dst_ops); 3414 } 3415 3416 static int __net_init ip6_route_net_init_late(struct net *net) 3417 { 3418 #ifdef CONFIG_PROC_FS 3419 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops); 3420 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops); 3421 #endif 3422 return 0; 3423 } 3424 3425 static void __net_exit ip6_route_net_exit_late(struct net *net) 3426 { 3427 #ifdef CONFIG_PROC_FS 3428 remove_proc_entry("ipv6_route", net->proc_net); 3429 remove_proc_entry("rt6_stats", net->proc_net); 3430 #endif 3431 } 3432 3433 static struct pernet_operations ip6_route_net_ops = { 3434 .init = ip6_route_net_init, 3435 .exit = ip6_route_net_exit, 3436 }; 3437 3438 static int __net_init ipv6_inetpeer_init(struct net *net) 3439 { 3440 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3441 3442 if (!bp) 3443 return -ENOMEM; 3444 inet_peer_base_init(bp); 3445 net->ipv6.peers = bp; 3446 return 0; 3447 } 3448 3449 static void __net_exit ipv6_inetpeer_exit(struct net *net) 3450 { 3451 struct inet_peer_base *bp = net->ipv6.peers; 3452 3453 net->ipv6.peers = 
NULL; 3454 inetpeer_invalidate_tree(bp); 3455 kfree(bp); 3456 } 3457 3458 static struct pernet_operations ipv6_inetpeer_ops = { 3459 .init = ipv6_inetpeer_init, 3460 .exit = ipv6_inetpeer_exit, 3461 }; 3462 3463 static struct pernet_operations ip6_route_net_late_ops = { 3464 .init = ip6_route_net_init_late, 3465 .exit = ip6_route_net_exit_late, 3466 }; 3467 3468 static struct notifier_block ip6_route_dev_notifier = { 3469 .notifier_call = ip6_route_dev_notify, 3470 .priority = 0, 3471 }; 3472 3473 int __init ip6_route_init(void) 3474 { 3475 int ret; 3476 int cpu; 3477 3478 ret = -ENOMEM; 3479 ip6_dst_ops_template.kmem_cachep = 3480 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, 3481 SLAB_HWCACHE_ALIGN, NULL); 3482 if (!ip6_dst_ops_template.kmem_cachep) 3483 goto out; 3484 3485 ret = dst_entries_init(&ip6_dst_blackhole_ops); 3486 if (ret) 3487 goto out_kmem_cache; 3488 3489 ret = register_pernet_subsys(&ipv6_inetpeer_ops); 3490 if (ret) 3491 goto out_dst_entries; 3492 3493 ret = register_pernet_subsys(&ip6_route_net_ops); 3494 if (ret) 3495 goto out_register_inetpeer; 3496 3497 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; 3498 3499 /* Registering of the loopback is done before this portion of code, 3500 * the loopback reference in rt6_info will not be taken, do it 3501 * manually for init_net */ 3502 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; 3503 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 3504 #ifdef CONFIG_IPV6_MULTIPLE_TABLES 3505 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; 3506 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 3507 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; 3508 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); 3509 #endif 3510 ret = fib6_init(); 3511 if (ret) 3512 goto out_register_subsys; 3513 3514 ret = xfrm6_init(); 3515 if (ret) 3516 
goto out_fib6_init; 3517 3518 ret = fib6_rules_init(); 3519 if (ret) 3520 goto xfrm6_init; 3521 3522 ret = register_pernet_subsys(&ip6_route_net_late_ops); 3523 if (ret) 3524 goto fib6_rules_init; 3525 3526 ret = -ENOBUFS; 3527 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) || 3528 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) || 3529 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL)) 3530 goto out_register_late_subsys; 3531 3532 ret = register_netdevice_notifier(&ip6_route_dev_notifier); 3533 if (ret) 3534 goto out_register_late_subsys; 3535 3536 for_each_possible_cpu(cpu) { 3537 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); 3538 3539 INIT_LIST_HEAD(&ul->head); 3540 spin_lock_init(&ul->lock); 3541 } 3542 3543 out: 3544 return ret; 3545 3546 out_register_late_subsys: 3547 unregister_pernet_subsys(&ip6_route_net_late_ops); 3548 fib6_rules_init: 3549 fib6_rules_cleanup(); 3550 xfrm6_init: 3551 xfrm6_fini(); 3552 out_fib6_init: 3553 fib6_gc_cleanup(); 3554 out_register_subsys: 3555 unregister_pernet_subsys(&ip6_route_net_ops); 3556 out_register_inetpeer: 3557 unregister_pernet_subsys(&ipv6_inetpeer_ops); 3558 out_dst_entries: 3559 dst_entries_destroy(&ip6_dst_blackhole_ops); 3560 out_kmem_cache: 3561 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 3562 goto out; 3563 } 3564 3565 void ip6_route_cleanup(void) 3566 { 3567 unregister_netdevice_notifier(&ip6_route_dev_notifier); 3568 unregister_pernet_subsys(&ip6_route_net_late_ops); 3569 fib6_rules_cleanup(); 3570 xfrm6_fini(); 3571 fib6_gc_cleanup(); 3572 unregister_pernet_subsys(&ipv6_inetpeer_ops); 3573 unregister_pernet_subsys(&ip6_route_net_ops); 3574 dst_entries_destroy(&ip6_dst_blackhole_ops); 3575 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); 3576 } 3577