/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc optimizes away the dummy loop. */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
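
/*
 * Usage sketch (illustrative, matching the callers later in this file):
 * the iterator macros above open a scope that endfor_nexthops() closes,
 * so a typical walk looks like
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *		... inspect nh and nhsel ...
 *	} endfor_nexthops(fi);
 *
 * change_nexthops() is the mutable variant, binding nexthop_nh instead
 * of the const nh.
 */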

const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};

/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	if (fi->fib_metrics != (u32 *) dst_default_metrics)
		kfree(fi->fib_metrics);
	kfree(fi);
}

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	call_rcu(&fi->rcu, free_fib_info_rcu);
}

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
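
/*
 * Sharing example (illustrative): two routes such as
 *
 *	ip route add 10.0.0.0/8  via 192.168.1.1 dev eth0
 *	ip route add 10.1.0.0/16 via 192.168.1.1 dev eth0
 *
 * carry identical nexthop and metric data, so the second
 * fib_create_info() call finds the first fib_info via fib_find_info()
 * above and reuses it, bumping fib_treeref instead of allocating a
 * duplicate.
 */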

/* Check that the gateway is already configured.
 * Used only by the redirect acceptance routine.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags & RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
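
/*
 * Informational: rtmsg_fib() is the notification side of route changes.
 * The RTM_NEWROUTE/RTM_DELROUTE messages multicast here to the
 * RTNLGRP_IPV4_ROUTE group are what userspace listeners such as
 * `ip monitor route` receive.
 */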

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}

int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	return 0;
}

#endif

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
		    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp == NULL)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_be32(nla) != nh->nh_gw)
				return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}
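
/*
 * Usage note (illustrative, assuming the deletion path calls in here):
 * fib_nh_match() backs route deletion, so e.g.
 * `ip route del 10.0.0.0/8 via 192.168.1.1` only removes an alias
 * whose fib_info carries that gateway, while a bare
 * `ip route del 10.0.0.0/8` (no oif/gw/priority selectors) matches
 * the first alias for the prefix.
 */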

/*
 * Picture
 * -------
 *
 * The semantics of nexthops are very messy, for historical reasons.
 * We have to take into account that:
 * a) the gateway can actually be a local interface address,
 *    so that the gatewayed route is direct.
 * b) the gateway must be an on-link address, possibly
 *    described not by an ifaddr but by a direct route.
 * c) if both the gateway and the interface are specified, they must
 *    not contradict each other.
 * d) if we use tunnel routes, the gateway may not be on-link.
 *
 * Attempting to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size of the code
 * does not really increase, but it becomes much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where the gw must have a narrower scope. This recursion stops
 * when the gw has LOCAL scope or when the "nexthop" is declared
 * ONLINK, which forces the gw to be on link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes to
 * coexist in peace.
 *
 * Normally it looks like the following.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *		  |
 *		  |-> {link prefix} -> (gw, oif) [scope local]
 *					|
 *					|-> {local prefix} (terminal node)
 */
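
/*
 * Concrete example (illustrative): for
 *
 *	ip route add 10.0.0.0/8 via 192.168.1.1
 *
 * fib_check_nh() below resolves 192.168.1.1 with a lookup whose scope
 * is strictly narrower than the new route's (fc_scope + 1, at least
 * RT_SCOPE_LINK). It must hit an RTN_UNICAST or RTN_LOCAL result such
 * as the connected 192.168.1.0/24 [scope link], otherwise the nexthop
 * is rejected. RTNH_F_ONLINK short-circuits this and simply pins the
 * gateway to nh_oif with scope link.
 */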
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;
	struct net_device *dev;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags & RTNH_F_ONLINK) {

			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			dev = __dev_get_by_index(net, nh->nh_oif);
			if (!dev)
				return -ENODEV;
			if (!(dev->flags & IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		rcu_read_lock();
		{
			struct flowi4 fl4 = {
				.daddr = nh->nh_gw,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;
			err = fib_lookup(net, &fl4, &res);
			if (err) {
				rcu_read_unlock();
				return err;
			}
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		nh->nh_dev = dev = FIB_RES_DEV(res);
		if (!dev)
			goto out;
		dev_hold(dev);
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
	} else {
		struct in_device *in_dev;

		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
			return -EINVAL;

		rcu_read_lock();
		err = -ENODEV;
		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP))
			goto out;
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		err = 0;
	}
out:
	rcu_read_unlock();
	return err;
}

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_info_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_info_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
}

static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_info_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}

__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->nh_dev,
					nh->nh_gw,
					nh->nh_parent->fib_scope);
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}
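
/*
 * Caching note (sketch, assuming the genid handling that lives outside
 * this file): nh_saddr is only trusted while nh_saddr_genid matches
 * the per-netns ipv4.dev_addr_genid, which address add/delete events
 * bump; a stale genid makes users recompute the preferred source
 * address instead of reusing the cached one.
 */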

struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_info_hash_size)
			goto failure;
	}

	fi = kzalloc(sizeof(*fi) + nhs * sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!fi->fib_metrics)
			goto failure;
	} else
		fi->fib_metrics = (u32 *) dst_default_metrics;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}
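
	/*
	 * Indexing note (illustrative): metric attribute types are
	 * 1-based (RTAX_MTU, RTAX_WINDOW, ...) while fib_metrics[] is
	 * 0-based, hence the "type - 1" above. E.g.
	 * `ip route add ... mtu 1400` arrives as a nested RTAX_MTU
	 * attribute and lands in fib_metrics[RTAX_MTU - 1].
	 */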

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
	} endfor_nexthops(fi)

link_it:
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
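
/*
 * Wire-format sketch (informational): fib_dump_info() below emits a
 * multipath route as
 *
 *	RTA_MULTIPATH
 *	    struct rtnexthop (len, flags, hops, ifindex)
 *		RTA_GATEWAY	(optional)
 *		RTA_FLOW	(optional, CONFIG_IP_ROUTE_CLASSID)
 *	    struct rtnexthop ...
 *
 * with each rtnh_len covering the rtnexthop header plus its nested
 * attributes, which is what the nlmsg_get_pos() arithmetic computes.
 */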

int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
 * Update the FIB if:
 * - a local address disappeared -> we must delete all the entries
 *   referring to it.
 * - a device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
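
/*
 * Caller sketch (assumption, per the event handlers in fib_frontend.c):
 * a plain link-down uses force == 0, so only nexthops whose scope
 * actually depends on the link are marked RTNH_F_DEAD; larger force
 * values (last address removed, device unregister) kill every nexthop
 * on the device, and force > 1 additionally declares any multipath
 * fib_info using the device fully dead.
 */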

/* Must be invoked inside of an RCU-protected region. */
void fib_select_default(struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct list_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	int order = -1, last_idx = -1;
	struct fib_alias *fa;

	list_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;

		if (next_fi->fib_priority > res->fi->fib_priority)
			break;
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (fi == NULL) {
			if (next_fi != res->fi)
				break;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, tb->tb_default)) {
			fib_result_assign(res, fi);
			tb->tb_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || fi == NULL) {
		tb->tb_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      tb->tb_default)) {
		fib_result_assign(res, fi);
		tb->tb_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	tb->tb_default = last_idx;
out:
	return;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
 * A dead device goes up. We wake up dead nexthops.
 * This makes sense only for multipath routes.
 */
int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}
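
/*
 * Worked example (illustrative): two alive nexthops with weights 2
 * and 1 begin a round with nh_power = {2, 1} and fib_power = 3 in
 * fib_select_multipath() below. Each call draws w in
 * [0..fib_power-1], walks the nexthops subtracting nh_power until
 * w <= 0, then decrements the winner's nh_power and fib_power; over
 * a full round the first nexthop is chosen twice for every pick of
 * the second, after which the budgets are refilled.
 */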

/*
 * The algorithm is suboptimal, but it provides really
 * fair weighted route distribution.
 */
void fib_select_multipath(struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	if (fi->fib_power <= 0) {
		int power = 0;

		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: the route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}

	/* w should be a random number in [0..fi->fib_power-1];
	 * jiffies is a pretty bad approximation.
	 */
	w = jiffies % fi->fib_power;

	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			w -= nexthop_nh->nh_power;
			if (w <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: the route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif