/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize it to get rid of the dummy loop */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};
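
/* Example (illustrative, derived from this file's own checks): a route
 * resolving to RTN_BLACKHOLE is answered with
 * fib_props[RTN_BLACKHOLE].error == -EINVAL, RTN_UNREACHABLE with
 * -EHOSTUNREACH and RTN_PROHIBIT with -EACCES. The .scope entry is the
 * widest scope a route of that type may have; fib_create_info() below
 * rejects a config whose fc_scope is numerically smaller (i.e. wider)
 * than it.
 */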

static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_free(&rt->dst);
}

static void free_nh_exceptions(struct fib_nh *nh)
{
	struct fnhe_hash_bucket *hash;
	int i;

	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!hash)
		return;
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			next = rcu_dereference_protected(fnhe->fnhe_next, 1);

			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);

			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt)
			dst_free(&rt->dst);
	}
	free_percpu(rtp);
}

/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);
	struct dst_metrics *m;

	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		lwtstate_put(nexthop_nh->nh_lwtstate);
		free_nh_exceptions(nexthop_nh);
		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
	} endfor_nexthops(fi);

	m = fi->fib_metrics;
	if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
		kfree(m);
	kfree(fi);
}

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}
	fib_info_cnt--;
#ifdef CONFIG_IP_ROUTE_CLASSID
	change_nexthops(fi) {
		if (nexthop_nh->nh_tclassid)
			fi->fib_net->ipv4.fib_num_tclassid_users--;
	} endfor_nexthops(fi);
#endif
	call_rcu(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}
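
/* Worked example (illustrative) for fib_devindex_hashfn() below: the
 * ifindex is folded into DEVINDEX_HASHBITS (8) bits by XOR-ing three
 * 8-bit slices, e.g. ifindex 0x00012345 hashes to
 * 0x45 ^ 0x23 ^ 0x01 = 0x67 after masking with DEVINDEX_HASHSIZE - 1.
 */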
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    nfi->fib_type == fi->fib_type &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check that the gateway is already configured.
 * Used only by redirect accept routine.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags & RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		size_t nh_encapsize = 0;
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* grab encap info */
		for_nexthops(fi) {
			if (nh->nh_lwtstate) {
				/* RTA_ENCAP_TYPE */
				nh_encapsize += lwtunnel_get_encap_size(
						nh->nh_lwtstate);
				/* RTA_ENCAP */
				nh_encapsize += nla_total_size(2);
			}
		} endfor_nexthops(fi);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size((fi->fib_nhs * nhsize) +
					  nh_encapsize);

	}

	return payload;
}
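
/* Illustrative note: rtmsg_fib() below sizes the skb with
 * fib_nlmsg_size(), fills it via fib_dump_info() and multicasts it to
 * RTNLGRP_IPV4_ROUTE listeners. A -EMSGSIZE from fib_dump_info() can
 * only mean fib_nlmsg_size() under-estimated, hence the WARN_ON().
 */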
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	} else {
		return 0;
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}
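
/* Illustrative layout (assumed from the rtnetlink format) of the
 * RTA_MULTIPATH payload parsed by fib_get_nhs() below: a sequence of
 * struct rtnexthop headers, each optionally followed by nested
 * per-hop attributes (RTA_GATEWAY, RTA_FLOW, RTA_ENCAP, ...):
 *
 *	[ rtnexthop | RTA_GATEWAY ... ][ rtnexthop | RTA_GATEWAY ... ]
 *
 * rtnh_ok()/rtnh_next() walk the sequence, while rtnh_attrs() and
 * rtnh_attrlen() expose each hop's attribute area.
 */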
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	int ret;

	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
			return -EINVAL;

		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
			if (nexthop_nh->nh_tclassid)
				fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
			nla = nla_find(attrs, attrlen, RTA_ENCAP);
			if (nla) {
				struct lwtunnel_state *lwtstate;
				struct nlattr *nla_entype;

				nla_entype = nla_find(attrs, attrlen,
						      RTA_ENCAP_TYPE);
				if (!nla_entype)
					goto err_inval;

				ret = lwtunnel_build_state(nla_get_u16(
							   nla_entype),
							   nla, AF_INET, cfg,
							   &lwtstate);
				if (ret)
					goto errout;
				nexthop_nh->nh_lwtstate =
					lwtstate_get(lwtstate);
			}
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;

err_inval:
	ret = -EINVAL;

errout:
	return ret;
}

static void fib_rebalance(struct fib_info *fi)
{
	int total;
	int w;
	struct in_device *in_dev;

	if (fi->fib_nhs < 2)
		return;

	total = 0;
	for_nexthops(fi) {
		if (nh->nh_flags & RTNH_F_DEAD)
			continue;

		in_dev = __in_dev_get_rtnl(nh->nh_dev);

		if (in_dev &&
		    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		    nh->nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->nh_weight;
	} endfor_nexthops(fi);

	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);

		if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (in_dev &&
			   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
			   nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->nh_weight;
			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
							    total) - 1;
		}

		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);
}
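
/* Worked example (illustrative) for fib_rebalance() above: two live
 * nexthops with weights 1 and 3 give total = 4, and the running sum w
 * yields the thresholds
 *
 *	nh0: DIV_ROUND_CLOSEST_ULL(1ULL << 31, 4) - 1 = 0x1fffffff
 *	nh1: DIV_ROUND_CLOSEST_ULL(4ULL << 31, 4) - 1 = 0x7fffffff
 *
 * so fib_select_multipath() picks nh0 for 31-bit hashes up to
 * 0x1fffffff (1/4 of the space) and nh1 otherwise (3/4). Dead or
 * ignored link-down hops get -1 and are never selected.
 */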
static inline void fib_add_weight(struct fib_info *fi,
				  const struct fib_nh *nh)
{
	fi->fib_weight += nh->nh_weight;
}

#else /* CONFIG_IP_ROUTE_MULTIPATH */

#define fib_rebalance(fi) do { } while (0)
#define fib_add_weight(fi, nh) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int fib_encap_match(u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg)
{
	struct lwtunnel_state *lwtstate;
	int ret, result = 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	ret = lwtunnel_build_state(encap_type, encap,
				   AF_INET, cfg, &lwtstate);
	if (!ret) {
		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
		lwtstate_free(lwtstate);
	}

	return result;
}

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if (cfg->fc_encap) {
			if (fib_encap_match(cfg->fc_encap_type,
					    cfg->fc_encap, fi->fib_nh, cfg))
				return 1;
		}
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (!cfg->fc_mp)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_in_addr(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}


/*
 * Picture
 * -------
 *
 * The semantics of nexthops are very messy for historical reasons.
 * We have to take into account that:
 * a) a gateway can actually be a local interface address,
 *    so that a gatewayed route is direct.
 * b) a gateway must be an on-link address, possibly
 *    described not by an ifaddr, but by a direct route.
 * c) if both gateway and interface are specified, they should not
 *    contradict.
 * d) if we use tunnel routes, the gateway could be not on-link.
 *
 * Attempting to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of the code does not increase in practice, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes to
 * coexist in peace.
 *
 * Normally it looks like the following.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
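
/* Hypothetical example of the recursion described above: adding
 * "10.0.0.0/8 via 192.168.1.1" creates a universe-scope prefix whose
 * gateway must itself resolve through a narrower-scope route, normally
 * the link-scope connected route covering 192.168.1.0/24.
 * fib_check_nh() below performs that lookup with
 * flowi4_scope = fc_scope + 1 (clamped to at least RT_SCOPE_LINK).
 */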
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err = 0;
	struct net *net;
	struct net_device *dev;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags & RTNH_F_ONLINK) {
			unsigned int addr_type;

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
				return -ENODEV;
			if (!(dev->flags & IFF_UP))
				return -ENETDOWN;
			addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
			if (addr_type != RTN_UNICAST)
				return -EINVAL;
			if (!netif_carrier_ok(dev))
				nh->nh_flags |= RTNH_F_LINKDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		rcu_read_lock();
		{
			struct fib_table *tbl = NULL;
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
				.flowi4_iif = LOOPBACK_IFINDEX,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;

			if (cfg->fc_table)
				tbl = fib_get_table(net, cfg->fc_table);

			if (tbl)
				err = fib_table_lookup(tbl, &fl4, &res,
						       FIB_LOOKUP_IGNORE_LINKSTATE |
						       FIB_LOOKUP_NOREF);

			/* on error or if no table given do full lookup. This
			 * is needed for example when nexthops are in the local
			 * table rather than the given table
			 */
			if (!tbl || err) {
				err = fib_lookup(net, &fl4, &res,
						 FIB_LOOKUP_IGNORE_LINKSTATE);
			}

			if (err) {
				rcu_read_unlock();
				return err;
			}
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
			goto out;
		dev_hold(dev);
		if (!netif_carrier_ok(dev))
			nh->nh_flags |= RTNH_F_LINKDOWN;
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
			return -EINVAL;

		rcu_read_lock();
		err = -ENODEV;
		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (!in_dev)
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		if (!netif_carrier_ok(nh->nh_dev))
			nh->nh_flags |= RTNH_F_LINKDOWN;
		err = 0;
	}
out:
	rcu_read_unlock();
	return err;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_info_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_info_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
}

static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_info_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}
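
/* Illustrative note: both tables above are resized together. When
 * fib_info_cnt reaches fib_info_hash_size, fib_create_info() doubles
 * the bucket count (starting from 16) and fib_info_hash_move()
 * rehashes every fib_info with fib_info_hashfn()/fib_laddr_hashfn()
 * under fib_info_lock.
 */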
__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->nh_dev,
					nh->nh_gw,
					nh->nh_parent->fib_scope);
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}

static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
	    fib_prefsrc != cfg->fc_dst) {
		u32 tb_id = cfg->fc_table;
		int rc;

		if (tb_id == RT_TABLE_MAIN)
			tb_id = RT_TABLE_LOCAL;

		rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
					  fib_prefsrc, tb_id);

		if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
			rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
						  fib_prefsrc, RT_TABLE_LOCAL);
		}

		if (rc != RTN_LOCAL)
			return false;
	}
	return true;
}

static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return 0;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return -EINVAL;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				return -EINVAL;
		} else {
			val = nla_get_u32(nla);
		}
		if (type == RTAX_ADVMSS && val > 65535 - 40)
			val = 65535 - 40;
		if (type == RTAX_MTU && val > 65535 - 15)
			val = 65535 - 15;
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			return -EINVAL;
		fi->fib_metrics->metrics[type - 1] = val;
	}

	if (ecn_ca)
		fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

	return 0;
}
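
/* Example (illustrative) of the clamping above: a request carrying
 * RTAX_MTU = 70000 is stored as 65520 (65535 - 15) and
 * RTAX_HOPLIMIT = 300 as 255, while a metric type above RTAX_MAX is
 * rejected with -EINVAL rather than clamped.
 */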
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 16;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_info_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (!fi)
		goto failure;
	fib_info_cnt++;
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
		atomic_set(&fi->fib_metrics->refcnt, 1);
	} else
		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;

	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;
	fi->fib_tb_id = cfg->fc_table;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
		if (!nexthop_nh->nh_pcpu_rth_output)
			goto failure;
	} endfor_nexthops(fi)

	err = fib_convert_metrics(fi, cfg);
	if (err)
		goto failure;

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		if (cfg->fc_encap) {
			struct lwtunnel_state *lwtstate;

			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
				goto err_inval;
			err = lwtunnel_build_state(cfg->fc_encap_type,
						   cfg->fc_encap, AF_INET, cfg,
						   &lwtstate);
			if (err)
				goto failure;

			nh->nh_lwtstate = lwtstate_get(lwtstate);
		}
		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh->nh_tclassid = cfg->fc_flow;
		if (nh->nh_tclassid)
			fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (!nh->nh_dev)
			goto failure;
	} else {
		int linkdown = 0;

		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
				linkdown++;
		} endfor_nexthops(fi)
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
	}

	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
		goto err_inval;

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
		fib_add_weight(fi, nexthop_nh);
	} endfor_nexthops(fi)

	fib_rebalance(fi);

link_it:
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
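
/* Illustrative note (inferred from the code above): a fib_info carries
 * two reference counts. fib_treeref counts the fib_alias entries
 * sharing the structure (fib_find_info() lets an identical route just
 * bump it), while fib_clntref is the atomic count taken on behalf of
 * lookups; fib_release_info() and fib_info_put() (defined elsewhere)
 * drop them respectively.
 */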
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len &&
	    nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;
	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;
	if (fi->fib_nhs == 1) {
		struct in_device *in_dev;

		if (fi->fib_nh->nh_gw &&
		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
			goto nla_put_failure;
		if (fi->fib_nh->nh_oif &&
		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
			goto nla_put_failure;
		if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
			in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
			if (in_dev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
				rtm->rtm_flags |= RTNH_F_DEAD;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
			goto nla_put_failure;
#endif
		if (fi->fib_nh->nh_lwtstate &&
		    lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
			goto nla_put_failure;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		for_nexthops(fi) {
			struct in_device *in_dev;

			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			if (nh->nh_flags & RTNH_F_LINKDOWN) {
				in_dev = __in_dev_get_rtnl(nh->nh_dev);
				if (in_dev &&
				    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
					rtnh->rtnh_flags |= RTNH_F_DEAD;
			}
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw &&
			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
				goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
#endif
			if (nh->nh_lwtstate &&
			    lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
				goto nla_put_failure;

			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
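
/* Illustrative note: table IDs above 255 do not fit in the u8
 * rtm_table field, so fib_dump_info() above stores RT_TABLE_COMPAT
 * there and relies on the 32-bit RTA_TABLE attribute to carry the
 * real ID.
 */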

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 */
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct net *net = dev_net(dev);
	int tb_id = l3mdev_fib_table(dev);
	struct fib_info *fi;

	if (!fib_info_laddrhash || local == 0)
		return 0;

	hlist_for_each_entry(fi, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net) ||
		    fi->fib_tb_id != tb_id)
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
				 enum fib_event_type event_type)
{
	struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
	struct fib_nh_notifier_info info = {
		.fib_nh = fib_nh,
	};

	switch (event_type) {
	case FIB_EVENT_NH_ADD:
		if (fib_nh->nh_flags & RTNH_F_DEAD)
			break;
		if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		    fib_nh->nh_flags & RTNH_F_LINKDOWN)
			break;
		return call_fib_notifiers(dev_net(fib_nh->nh_dev), event_type,
					  &info.info);
	case FIB_EVENT_NH_DEL:
		if ((IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		     fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
		    (fib_nh->nh_flags & RTNH_F_DEAD))
			return call_fib_notifiers(dev_net(fib_nh->nh_dev),
						  event_type, &info.info);
	default:
		break;
	}

	return NOTIFY_DONE;
}

/* Event               force  Flags          Description
 * NETDEV_CHANGE       0      LINKDOWN       Carrier OFF, not for scope host
 * NETDEV_DOWN         0      LINKDOWN|DEAD  Link down, not for scope host
 * NETDEV_DOWN         1      LINKDOWN|DEAD  Last address removed
 * NETDEV_UNREGISTER   1      LINKDOWN|DEAD  Device removed
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				switch (event) {
				case NETDEV_DOWN:
				case NETDEV_UNREGISTER:
					nexthop_nh->nh_flags |= RTNH_F_DEAD;
					/* fall through */
				case NETDEV_CHANGE:
					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
					break;
				}
				call_fib_nh_notifiers(nexthop_nh,
						      FIB_EVENT_NH_DEL);
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			switch (event) {
			case NETDEV_DOWN:
			case NETDEV_UNREGISTER:
				fi->fib_flags |= RTNH_F_DEAD;
				/* fall through */
			case NETDEV_CHANGE:
				fi->fib_flags |= RTNH_F_LINKDOWN;
				break;
			}
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}
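
/* Illustrative note: fib_select_default() below only considers aliases
 * with the same prefix length, TOS and table as the current result; it
 * probes each candidate gateway's neighbour state via
 * fib_detect_death() and falls back to the recorded "last resort"
 * entry when every candidate looks dead.
 */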
/* Must be invoked inside of an RCU protected region. */
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	u8 slen = 32 - res->prefixlen;
	int order = -1, last_idx = -1;
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	u8 last_tos = 0;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_slen != slen)
			continue;
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fa->tb_id != tb->tb_id)
			continue;
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_tos == last_tos) {
			if (last_tos)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_tos = fa->fa_tos;
		last_prio = next_fi->fib_priority;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			if (next_fi != res->fi)
				break;
			fa1 = fa;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, fa1->fa_default)) {
			fib_result_assign(res, fi);
			fa1->fa_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || !fi) {
		if (fa1)
			fa1->fa_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      fa1->fa_default)) {
		fib_result_assign(res, fi);
		fa1->fa_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	fa1->fa_default = last_idx;
out:
	return;
}
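
/* Illustrative note: when fib_sync_up() below is asked to clear
 * RTNH_F_DEAD and the device reports IFF_RUNNING | IFF_LOWER_UP, it
 * widens the mask to clear RTNH_F_LINKDOWN as well, so a revived
 * device comes back fully usable in one pass.
 */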
/*
 * Dead device goes up. We wake up dead nexthops.
 * It makes sense only on multipath routes.
 */
int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = dev_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & nh_flags)) {
				alive++;
				continue;
			}
			if (!nexthop_nh->nh_dev ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			nexthop_nh->nh_flags &= ~nh_flags;
			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~nh_flags;
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
static bool fib_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;

	if (nh->nh_scope == RT_SCOPE_LINK) {
		struct neighbour *n;

		rcu_read_lock_bh();

		n = __ipv4_neigh_lookup_noref(nh->nh_dev,
					      (__force u32)nh->nh_gw);
		if (n)
			state = n->nud_state;

		rcu_read_unlock_bh();
	}

	return !!(state & NUD_VALID);
}

void fib_select_multipath(struct fib_result *res, int hash)
{
	struct fib_info *fi = res->fi;
	struct net *net = fi->fib_net;
	bool first = false;

	for_nexthops(fi) {
		if (hash > atomic_read(&nh->nh_upper_bound))
			continue;

		if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
		    fib_good_nh(nh)) {
			res->nh_sel = nhsel;
			return;
		}
		if (!first) {
			res->nh_sel = nhsel;
			first = true;
		}
	} endfor_nexthops(fi);
}
#endif

void fib_select_path(struct net *net, struct fib_result *res,
		     struct flowi4 *fl4, const struct sk_buff *skb)
{
	bool oif_check;

	oif_check = (fl4->flowi4_oif == 0 ||
		     fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi->fib_nhs > 1 && oif_check) {
		int h = fib_multipath_hash(res->fi, fl4, skb);

		fib_select_multipath(res, h);
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
	    res->type == RTN_UNICAST && oif_check)
		fib_select_default(fl4, res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, *res);
}
EXPORT_SYMBOL_GPL(fib_select_path);
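
/* Illustrative call flow (assumed, simplified): the IPv4 output path
 * does roughly
 *
 *	err = fib_lookup(net, fl4, res, 0);
 *	if (!err)
 *		fib_select_path(net, res, fl4, skb);
 *
 * so multipath hashing and default-route election happen after the
 * longest-prefix match but before a dst/rtable is instantiated.
 */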