1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * IPv4 Forwarding Information Base: semantics. 7 * 8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ 9 * 10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 11 * 12 * This program is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU General Public License 14 * as published by the Free Software Foundation; either version 15 * 2 of the License, or (at your option) any later version. 16 */ 17 18 #include <asm/uaccess.h> 19 #include <asm/system.h> 20 #include <linux/bitops.h> 21 #include <linux/types.h> 22 #include <linux/kernel.h> 23 #include <linux/jiffies.h> 24 #include <linux/mm.h> 25 #include <linux/string.h> 26 #include <linux/socket.h> 27 #include <linux/sockios.h> 28 #include <linux/errno.h> 29 #include <linux/in.h> 30 #include <linux/inet.h> 31 #include <linux/inetdevice.h> 32 #include <linux/netdevice.h> 33 #include <linux/if_arp.h> 34 #include <linux/proc_fs.h> 35 #include <linux/skbuff.h> 36 #include <linux/init.h> 37 38 #include <net/arp.h> 39 #include <net/ip.h> 40 #include <net/protocol.h> 41 #include <net/route.h> 42 #include <net/tcp.h> 43 #include <net/sock.h> 44 #include <net/ip_fib.h> 45 #include <net/netlink.h> 46 #include <net/nexthop.h> 47 48 #include "fib_lookup.h" 49 50 static DEFINE_SPINLOCK(fib_info_lock); 51 static struct hlist_head *fib_info_hash; 52 static struct hlist_head *fib_info_laddrhash; 53 static unsigned int fib_hash_size; 54 static unsigned int fib_info_cnt; 55 56 #define DEVINDEX_HASHBITS 8 57 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) 58 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; 59 60 #ifdef CONFIG_IP_ROUTE_MULTIPATH 61 62 static DEFINE_SPINLOCK(fib_multipath_lock); 63 64 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ 65 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 66 67 #define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ 68 for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) 69 70 #else /* CONFIG_IP_ROUTE_MULTIPATH */ 71 72 /* Hope, that gcc will optimize it to get rid of dummy loop */ 73 74 #define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ 75 for (nhsel=0; nhsel < 1; nhsel++) 76 77 #define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ 78 for (nhsel=0; nhsel < 1; nhsel++) 79 80 #endif /* CONFIG_IP_ROUTE_MULTIPATH */ 81 82 #define endfor_nexthops(fi) } 83 84 85 static const struct 86 { 87 int error; 88 u8 scope; 89 } fib_props[RTN_MAX + 1] = { 90 { 91 .error = 0, 92 .scope = RT_SCOPE_NOWHERE, 93 }, /* RTN_UNSPEC */ 94 { 95 .error = 0, 96 .scope = RT_SCOPE_UNIVERSE, 97 }, /* RTN_UNICAST */ 98 { 99 .error = 0, 100 .scope = RT_SCOPE_HOST, 101 }, /* RTN_LOCAL */ 102 { 103 .error = 0, 104 .scope = RT_SCOPE_LINK, 105 }, /* RTN_BROADCAST */ 106 { 107 .error = 0, 108 .scope = RT_SCOPE_LINK, 109 }, /* RTN_ANYCAST */ 110 { 111 .error = 0, 112 .scope = RT_SCOPE_UNIVERSE, 113 }, /* RTN_MULTICAST */ 114 { 115 .error = -EINVAL, 116 .scope = RT_SCOPE_UNIVERSE, 117 }, /* RTN_BLACKHOLE */ 118 { 119 .error = -EHOSTUNREACH, 120 .scope = RT_SCOPE_UNIVERSE, 121 }, /* RTN_UNREACHABLE */ 122 { 123 .error = -EACCES, 124 .scope = RT_SCOPE_UNIVERSE, 125 }, /* RTN_PROHIBIT */ 126 { 127 .error = -EAGAIN, 128 .scope = RT_SCOPE_UNIVERSE, 129 }, /* RTN_THROW */ 130 { 131 .error = -EINVAL, 132 .scope = RT_SCOPE_NOWHERE, 133 }, /* RTN_NAT */ 134 { 135 .error = -EINVAL, 136 .scope = RT_SCOPE_NOWHERE, 137 }, /* RTN_XRESOLVE */ 138 }; 139 140 141 /* Release a nexthop info record */ 142 143 void free_fib_info(struct fib_info *fi) 144 { 145 if (fi->fib_dead == 0) { 146 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); 147 return; 148 } 149 change_nexthops(fi) { 150 if (nh->nh_dev) 151 dev_put(nh->nh_dev); 152 nh->nh_dev = NULL; 153 } endfor_nexthops(fi); 154 fib_info_cnt--; 155 release_net(fi->fib_net); 156 kfree(fi); 157 } 158 159 void fib_release_info(struct fib_info *fi) 160 { 161 spin_lock_bh(&fib_info_lock); 162 if (fi && --fi->fib_treeref == 0) { 163 hlist_del(&fi->fib_hash); 164 if (fi->fib_prefsrc) 165 hlist_del(&fi->fib_lhash); 166 change_nexthops(fi) { 167 if (!nh->nh_dev) 168 continue; 169 hlist_del(&nh->nh_hash); 170 } endfor_nexthops(fi) 171 fi->fib_dead = 1; 172 fib_info_put(fi); 173 } 174 spin_unlock_bh(&fib_info_lock); 175 } 176 177 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) 178 { 179 const struct fib_nh *onh = ofi->fib_nh; 180 181 for_nexthops(fi) { 182 if (nh->nh_oif != onh->nh_oif || 183 nh->nh_gw != onh->nh_gw || 184 nh->nh_scope != onh->nh_scope || 185 #ifdef CONFIG_IP_ROUTE_MULTIPATH 186 nh->nh_weight != onh->nh_weight || 187 #endif 188 #ifdef CONFIG_NET_CLS_ROUTE 189 nh->nh_tclassid != onh->nh_tclassid || 190 #endif 191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) 192 return -1; 193 onh++; 194 } endfor_nexthops(fi); 195 return 0; 196 } 197 198 static inline unsigned int fib_devindex_hashfn(unsigned int val) 199 { 200 unsigned int mask = DEVINDEX_HASHSIZE - 1; 201 202 return (val ^ 203 (val >> DEVINDEX_HASHBITS) ^ 204 (val >> (DEVINDEX_HASHBITS * 2))) & mask; 205 } 206 207 static inline unsigned int fib_info_hashfn(const struct fib_info *fi) 208 { 209 unsigned int mask = (fib_hash_size - 1); 210 unsigned int val = fi->fib_nhs; 211 212 val ^= fi->fib_protocol; 213 val ^= (__force u32)fi->fib_prefsrc; 214 val ^= fi->fib_priority; 215 for_nexthops(fi) { 216 val ^= fib_devindex_hashfn(nh->nh_oif); 217 } endfor_nexthops(fi) 218 219 return (val ^ (val >> 7) ^ (val >> 12)) & mask; 220 } 221 222 static struct fib_info *fib_find_info(const struct fib_info *nfi) 223 { 224 struct hlist_head *head; 225 struct hlist_node *node; 226 struct fib_info *fi; 227 unsigned int hash; 228 229 hash = fib_info_hashfn(nfi); 230 head = &fib_info_hash[hash]; 231 232 hlist_for_each_entry(fi, node, head, fib_hash) { 233 if (fi->fib_net != nfi->fib_net) 234 continue; 235 if (fi->fib_nhs != nfi->fib_nhs) 236 continue; 237 if (nfi->fib_protocol == fi->fib_protocol && 238 nfi->fib_prefsrc == fi->fib_prefsrc && 239 nfi->fib_priority == fi->fib_priority && 240 memcmp(nfi->fib_metrics, fi->fib_metrics, 241 sizeof(fi->fib_metrics)) == 0 && 242 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && 243 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) 244 return fi; 245 } 246 247 return NULL; 248 } 249 250 /* Check, that the gateway is already configured. 251 Used only by redirect accept routine. 252 */ 253 254 int ip_fib_check_default(__be32 gw, struct net_device *dev) 255 { 256 struct hlist_head *head; 257 struct hlist_node *node; 258 struct fib_nh *nh; 259 unsigned int hash; 260 261 spin_lock(&fib_info_lock); 262 263 hash = fib_devindex_hashfn(dev->ifindex); 264 head = &fib_info_devhash[hash]; 265 hlist_for_each_entry(nh, node, head, nh_hash) { 266 if (nh->nh_dev == dev && 267 nh->nh_gw == gw && 268 !(nh->nh_flags&RTNH_F_DEAD)) { 269 spin_unlock(&fib_info_lock); 270 return 0; 271 } 272 } 273 274 spin_unlock(&fib_info_lock); 275 276 return -1; 277 } 278 279 static inline size_t fib_nlmsg_size(struct fib_info *fi) 280 { 281 size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) 282 + nla_total_size(4) /* RTA_TABLE */ 283 + nla_total_size(4) /* RTA_DST */ 284 + nla_total_size(4) /* RTA_PRIORITY */ 285 + nla_total_size(4); /* RTA_PREFSRC */ 286 287 /* space for nested metrics */ 288 payload += nla_total_size((RTAX_MAX * nla_total_size(4))); 289 290 if (fi->fib_nhs) { 291 /* Also handles the special case fib_nhs == 1 */ 292 293 /* each nexthop is packed in an attribute */ 294 size_t nhsize = nla_total_size(sizeof(struct rtnexthop)); 295 296 /* may contain flow and gateway attribute */ 297 nhsize += 2 * nla_total_size(4); 298 299 /* all nexthops are packed in a nested attribute */ 300 payload += nla_total_size(fi->fib_nhs * nhsize); 301 } 302 303 return payload; 304 } 305 306 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, 307 int dst_len, u32 tb_id, struct nl_info *info, 308 unsigned int nlm_flags) 309 { 310 struct sk_buff *skb; 311 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 312 int err = -ENOBUFS; 313 314 skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL); 315 if (skb == NULL) 316 goto errout; 317 318 err = fib_dump_info(skb, info->pid, seq, event, tb_id, 319 fa->fa_type, fa->fa_scope, key, dst_len, 320 fa->fa_tos, fa->fa_info, nlm_flags); 321 if (err < 0) { 322 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ 323 WARN_ON(err == -EMSGSIZE); 324 kfree_skb(skb); 325 goto errout; 326 } 327 err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, 328 info->nlh, GFP_KERNEL); 329 errout: 330 if (err < 0) 331 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); 332 } 333 334 /* Return the first fib alias matching TOS with 335 * priority less than or equal to PRIO. 336 */ 337 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) 338 { 339 if (fah) { 340 struct fib_alias *fa; 341 list_for_each_entry(fa, fah, fa_list) { 342 if (fa->fa_tos > tos) 343 continue; 344 if (fa->fa_info->fib_priority >= prio || 345 fa->fa_tos < tos) 346 return fa; 347 } 348 } 349 return NULL; 350 } 351 352 int fib_detect_death(struct fib_info *fi, int order, 353 struct fib_info **last_resort, int *last_idx, int dflt) 354 { 355 struct neighbour *n; 356 int state = NUD_NONE; 357 358 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); 359 if (n) { 360 state = n->nud_state; 361 neigh_release(n); 362 } 363 if (state==NUD_REACHABLE) 364 return 0; 365 if ((state&NUD_VALID) && order != dflt) 366 return 0; 367 if ((state&NUD_VALID) || 368 (*last_idx<0 && order > dflt)) { 369 *last_resort = fi; 370 *last_idx = order; 371 } 372 return 1; 373 } 374 375 #ifdef CONFIG_IP_ROUTE_MULTIPATH 376 377 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) 378 { 379 int nhs = 0; 380 381 while (rtnh_ok(rtnh, remaining)) { 382 nhs++; 383 rtnh = rtnh_next(rtnh, &remaining); 384 } 385 386 /* leftover implies invalid nexthop configuration, discard it */ 387 return remaining > 0 ? 0 : nhs; 388 } 389 390 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, 391 int remaining, struct fib_config *cfg) 392 { 393 change_nexthops(fi) { 394 int attrlen; 395 396 if (!rtnh_ok(rtnh, remaining)) 397 return -EINVAL; 398 399 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; 400 nh->nh_oif = rtnh->rtnh_ifindex; 401 nh->nh_weight = rtnh->rtnh_hops + 1; 402 403 attrlen = rtnh_attrlen(rtnh); 404 if (attrlen > 0) { 405 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 406 407 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 408 nh->nh_gw = nla ? nla_get_be32(nla) : 0; 409 #ifdef CONFIG_NET_CLS_ROUTE 410 nla = nla_find(attrs, attrlen, RTA_FLOW); 411 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 412 #endif 413 } 414 415 rtnh = rtnh_next(rtnh, &remaining); 416 } endfor_nexthops(fi); 417 418 return 0; 419 } 420 421 #endif 422 423 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) 424 { 425 #ifdef CONFIG_IP_ROUTE_MULTIPATH 426 struct rtnexthop *rtnh; 427 int remaining; 428 #endif 429 430 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) 431 return 1; 432 433 if (cfg->fc_oif || cfg->fc_gw) { 434 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && 435 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) 436 return 0; 437 return 1; 438 } 439 440 #ifdef CONFIG_IP_ROUTE_MULTIPATH 441 if (cfg->fc_mp == NULL) 442 return 0; 443 444 rtnh = cfg->fc_mp; 445 remaining = cfg->fc_mp_len; 446 447 for_nexthops(fi) { 448 int attrlen; 449 450 if (!rtnh_ok(rtnh, remaining)) 451 return -EINVAL; 452 453 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) 454 return 1; 455 456 attrlen = rtnh_attrlen(rtnh); 457 if (attrlen < 0) { 458 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 459 460 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 461 if (nla && nla_get_be32(nla) != nh->nh_gw) 462 return 1; 463 #ifdef CONFIG_NET_CLS_ROUTE 464 nla = nla_find(attrs, attrlen, RTA_FLOW); 465 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 466 return 1; 467 #endif 468 } 469 470 rtnh = rtnh_next(rtnh, &remaining); 471 } endfor_nexthops(fi); 472 #endif 473 return 0; 474 } 475 476 477 /* 478 Picture 479 ------- 480 481 Semantics of nexthop is very messy by historical reasons. 482 We have to take into account, that: 483 a) gateway can be actually local interface address, 484 so that gatewayed route is direct. 485 b) gateway must be on-link address, possibly 486 described not by an ifaddr, but also by a direct route. 487 c) If both gateway and interface are specified, they should not 488 contradict. 489 d) If we use tunnel routes, gateway could be not on-link. 490 491 Attempt to reconcile all of these (alas, self-contradictory) conditions 492 results in pretty ugly and hairy code with obscure logic. 493 494 I chose to generalized it instead, so that the size 495 of code does not increase practically, but it becomes 496 much more general. 497 Every prefix is assigned a "scope" value: "host" is local address, 498 "link" is direct route, 499 [ ... "site" ... "interior" ... ] 500 and "universe" is true gateway route with global meaning. 501 502 Every prefix refers to a set of "nexthop"s (gw, oif), 503 where gw must have narrower scope. This recursion stops 504 when gw has LOCAL scope or if "nexthop" is declared ONLINK, 505 which means that gw is forced to be on link. 506 507 Code is still hairy, but now it is apparently logically 508 consistent and very flexible. F.e. as by-product it allows 509 to co-exists in peace independent exterior and interior 510 routing processes. 511 512 Normally it looks as following. 513 514 {universe prefix} -> (gw, oif) [scope link] 515 | 516 |-> {link prefix} -> (gw, oif) [scope local] 517 | 518 |-> {local prefix} (terminal node) 519 */ 520 521 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, 522 struct fib_nh *nh) 523 { 524 int err; 525 struct net *net; 526 527 net = cfg->fc_nlinfo.nl_net; 528 if (nh->nh_gw) { 529 struct fib_result res; 530 531 #ifdef CONFIG_IP_ROUTE_PERVASIVE 532 if (nh->nh_flags&RTNH_F_PERVASIVE) 533 return 0; 534 #endif 535 if (nh->nh_flags&RTNH_F_ONLINK) { 536 struct net_device *dev; 537 538 if (cfg->fc_scope >= RT_SCOPE_LINK) 539 return -EINVAL; 540 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) 541 return -EINVAL; 542 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) 543 return -ENODEV; 544 if (!(dev->flags&IFF_UP)) 545 return -ENETDOWN; 546 nh->nh_dev = dev; 547 dev_hold(dev); 548 nh->nh_scope = RT_SCOPE_LINK; 549 return 0; 550 } 551 { 552 struct flowi fl = { 553 .nl_u = { 554 .ip4_u = { 555 .daddr = nh->nh_gw, 556 .scope = cfg->fc_scope + 1, 557 }, 558 }, 559 .oif = nh->nh_oif, 560 }; 561 562 /* It is not necessary, but requires a bit of thinking */ 563 if (fl.fl4_scope < RT_SCOPE_LINK) 564 fl.fl4_scope = RT_SCOPE_LINK; 565 if ((err = fib_lookup(net, &fl, &res)) != 0) 566 return err; 567 } 568 err = -EINVAL; 569 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) 570 goto out; 571 nh->nh_scope = res.scope; 572 nh->nh_oif = FIB_RES_OIF(res); 573 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) 574 goto out; 575 dev_hold(nh->nh_dev); 576 err = -ENETDOWN; 577 if (!(nh->nh_dev->flags & IFF_UP)) 578 goto out; 579 err = 0; 580 out: 581 fib_res_put(&res); 582 return err; 583 } else { 584 struct in_device *in_dev; 585 586 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) 587 return -EINVAL; 588 589 in_dev = inetdev_by_index(net, nh->nh_oif); 590 if (in_dev == NULL) 591 return -ENODEV; 592 if (!(in_dev->dev->flags&IFF_UP)) { 593 in_dev_put(in_dev); 594 return -ENETDOWN; 595 } 596 nh->nh_dev = in_dev->dev; 597 dev_hold(nh->nh_dev); 598 nh->nh_scope = RT_SCOPE_HOST; 599 in_dev_put(in_dev); 600 } 601 return 0; 602 } 603 604 static inline unsigned int fib_laddr_hashfn(__be32 val) 605 { 606 unsigned int mask = (fib_hash_size - 1); 607 608 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; 609 } 610 611 static struct hlist_head *fib_hash_alloc(int bytes) 612 { 613 if (bytes <= PAGE_SIZE) 614 return kzalloc(bytes, GFP_KERNEL); 615 else 616 return (struct hlist_head *) 617 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); 618 } 619 620 static void fib_hash_free(struct hlist_head *hash, int bytes) 621 { 622 if (!hash) 623 return; 624 625 if (bytes <= PAGE_SIZE) 626 kfree(hash); 627 else 628 free_pages((unsigned long) hash, get_order(bytes)); 629 } 630 631 static void fib_hash_move(struct hlist_head *new_info_hash, 632 struct hlist_head *new_laddrhash, 633 unsigned int new_size) 634 { 635 struct hlist_head *old_info_hash, *old_laddrhash; 636 unsigned int old_size = fib_hash_size; 637 unsigned int i, bytes; 638 639 spin_lock_bh(&fib_info_lock); 640 old_info_hash = fib_info_hash; 641 old_laddrhash = fib_info_laddrhash; 642 fib_hash_size = new_size; 643 644 for (i = 0; i < old_size; i++) { 645 struct hlist_head *head = &fib_info_hash[i]; 646 struct hlist_node *node, *n; 647 struct fib_info *fi; 648 649 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { 650 struct hlist_head *dest; 651 unsigned int new_hash; 652 653 hlist_del(&fi->fib_hash); 654 655 new_hash = fib_info_hashfn(fi); 656 dest = &new_info_hash[new_hash]; 657 hlist_add_head(&fi->fib_hash, dest); 658 } 659 } 660 fib_info_hash = new_info_hash; 661 662 for (i = 0; i < old_size; i++) { 663 struct hlist_head *lhead = &fib_info_laddrhash[i]; 664 struct hlist_node *node, *n; 665 struct fib_info *fi; 666 667 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { 668 struct hlist_head *ldest; 669 unsigned int new_hash; 670 671 hlist_del(&fi->fib_lhash); 672 673 new_hash = fib_laddr_hashfn(fi->fib_prefsrc); 674 ldest = &new_laddrhash[new_hash]; 675 hlist_add_head(&fi->fib_lhash, ldest); 676 } 677 } 678 fib_info_laddrhash = new_laddrhash; 679 680 spin_unlock_bh(&fib_info_lock); 681 682 bytes = old_size * sizeof(struct hlist_head *); 683 fib_hash_free(old_info_hash, bytes); 684 fib_hash_free(old_laddrhash, bytes); 685 } 686 687 struct fib_info *fib_create_info(struct fib_config *cfg) 688 { 689 int err; 690 struct fib_info *fi = NULL; 691 struct fib_info *ofi; 692 int nhs = 1; 693 struct net *net = cfg->fc_nlinfo.nl_net; 694 695 /* Fast check to catch the most weird cases */ 696 if (fib_props[cfg->fc_type].scope > cfg->fc_scope) 697 goto err_inval; 698 699 #ifdef CONFIG_IP_ROUTE_MULTIPATH 700 if (cfg->fc_mp) { 701 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); 702 if (nhs == 0) 703 goto err_inval; 704 } 705 #endif 706 707 err = -ENOBUFS; 708 if (fib_info_cnt >= fib_hash_size) { 709 unsigned int new_size = fib_hash_size << 1; 710 struct hlist_head *new_info_hash; 711 struct hlist_head *new_laddrhash; 712 unsigned int bytes; 713 714 if (!new_size) 715 new_size = 1; 716 bytes = new_size * sizeof(struct hlist_head *); 717 new_info_hash = fib_hash_alloc(bytes); 718 new_laddrhash = fib_hash_alloc(bytes); 719 if (!new_info_hash || !new_laddrhash) { 720 fib_hash_free(new_info_hash, bytes); 721 fib_hash_free(new_laddrhash, bytes); 722 } else 723 fib_hash_move(new_info_hash, new_laddrhash, new_size); 724 725 if (!fib_hash_size) 726 goto failure; 727 } 728 729 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); 730 if (fi == NULL) 731 goto failure; 732 fib_info_cnt++; 733 734 fi->fib_net = hold_net(net); 735 fi->fib_protocol = cfg->fc_protocol; 736 fi->fib_flags = cfg->fc_flags; 737 fi->fib_priority = cfg->fc_priority; 738 fi->fib_prefsrc = cfg->fc_prefsrc; 739 740 fi->fib_nhs = nhs; 741 change_nexthops(fi) { 742 nh->nh_parent = fi; 743 } endfor_nexthops(fi) 744 745 if (cfg->fc_mx) { 746 struct nlattr *nla; 747 int remaining; 748 749 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { 750 int type = nla_type(nla); 751 752 if (type) { 753 if (type > RTAX_MAX) 754 goto err_inval; 755 fi->fib_metrics[type - 1] = nla_get_u32(nla); 756 } 757 } 758 } 759 760 if (cfg->fc_mp) { 761 #ifdef CONFIG_IP_ROUTE_MULTIPATH 762 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg); 763 if (err != 0) 764 goto failure; 765 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) 766 goto err_inval; 767 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 768 goto err_inval; 769 #ifdef CONFIG_NET_CLS_ROUTE 770 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 771 goto err_inval; 772 #endif 773 #else 774 goto err_inval; 775 #endif 776 } else { 777 struct fib_nh *nh = fi->fib_nh; 778 779 nh->nh_oif = cfg->fc_oif; 780 nh->nh_gw = cfg->fc_gw; 781 nh->nh_flags = cfg->fc_flags; 782 #ifdef CONFIG_NET_CLS_ROUTE 783 nh->nh_tclassid = cfg->fc_flow; 784 #endif 785 #ifdef CONFIG_IP_ROUTE_MULTIPATH 786 nh->nh_weight = 1; 787 #endif 788 } 789 790 if (fib_props[cfg->fc_type].error) { 791 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) 792 goto err_inval; 793 goto link_it; 794 } 795 796 if (cfg->fc_scope > RT_SCOPE_HOST) 797 goto err_inval; 798 799 if (cfg->fc_scope == RT_SCOPE_HOST) { 800 struct fib_nh *nh = fi->fib_nh; 801 802 /* Local address is added. */ 803 if (nhs != 1 || nh->nh_gw) 804 goto err_inval; 805 nh->nh_scope = RT_SCOPE_NOWHERE; 806 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif); 807 err = -ENODEV; 808 if (nh->nh_dev == NULL) 809 goto failure; 810 } else { 811 change_nexthops(fi) { 812 if ((err = fib_check_nh(cfg, fi, nh)) != 0) 813 goto failure; 814 } endfor_nexthops(fi) 815 } 816 817 if (fi->fib_prefsrc) { 818 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || 819 fi->fib_prefsrc != cfg->fc_dst) 820 if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) 821 goto err_inval; 822 } 823 824 link_it: 825 if ((ofi = fib_find_info(fi)) != NULL) { 826 fi->fib_dead = 1; 827 free_fib_info(fi); 828 ofi->fib_treeref++; 829 return ofi; 830 } 831 832 fi->fib_treeref++; 833 atomic_inc(&fi->fib_clntref); 834 spin_lock_bh(&fib_info_lock); 835 hlist_add_head(&fi->fib_hash, 836 &fib_info_hash[fib_info_hashfn(fi)]); 837 if (fi->fib_prefsrc) { 838 struct hlist_head *head; 839 840 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; 841 hlist_add_head(&fi->fib_lhash, head); 842 } 843 change_nexthops(fi) { 844 struct hlist_head *head; 845 unsigned int hash; 846 847 if (!nh->nh_dev) 848 continue; 849 hash = fib_devindex_hashfn(nh->nh_dev->ifindex); 850 head = &fib_info_devhash[hash]; 851 hlist_add_head(&nh->nh_hash, head); 852 } endfor_nexthops(fi) 853 spin_unlock_bh(&fib_info_lock); 854 return fi; 855 856 err_inval: 857 err = -EINVAL; 858 859 failure: 860 if (fi) { 861 fi->fib_dead = 1; 862 free_fib_info(fi); 863 } 864 865 return ERR_PTR(err); 866 } 867 868 /* Note! fib_semantic_match intentionally uses RCU list functions. */ 869 int fib_semantic_match(struct list_head *head, const struct flowi *flp, 870 struct fib_result *res, __be32 zone, __be32 mask, 871 int prefixlen) 872 { 873 struct fib_alias *fa; 874 int nh_sel = 0; 875 876 list_for_each_entry_rcu(fa, head, fa_list) { 877 int err; 878 879 if (fa->fa_tos && 880 fa->fa_tos != flp->fl4_tos) 881 continue; 882 883 if (fa->fa_scope < flp->fl4_scope) 884 continue; 885 886 fa->fa_state |= FA_S_ACCESSED; 887 888 err = fib_props[fa->fa_type].error; 889 if (err == 0) { 890 struct fib_info *fi = fa->fa_info; 891 892 if (fi->fib_flags & RTNH_F_DEAD) 893 continue; 894 895 switch (fa->fa_type) { 896 case RTN_UNICAST: 897 case RTN_LOCAL: 898 case RTN_BROADCAST: 899 case RTN_ANYCAST: 900 case RTN_MULTICAST: 901 for_nexthops(fi) { 902 if (nh->nh_flags&RTNH_F_DEAD) 903 continue; 904 if (!flp->oif || flp->oif == nh->nh_oif) 905 break; 906 } 907 #ifdef CONFIG_IP_ROUTE_MULTIPATH 908 if (nhsel < fi->fib_nhs) { 909 nh_sel = nhsel; 910 goto out_fill_res; 911 } 912 #else 913 if (nhsel < 1) { 914 goto out_fill_res; 915 } 916 #endif 917 endfor_nexthops(fi); 918 continue; 919 920 default: 921 printk(KERN_WARNING "fib_semantic_match bad type %#x\n", 922 fa->fa_type); 923 return -EINVAL; 924 } 925 } 926 return err; 927 } 928 return 1; 929 930 out_fill_res: 931 res->prefixlen = prefixlen; 932 res->nh_sel = nh_sel; 933 res->type = fa->fa_type; 934 res->scope = fa->fa_scope; 935 res->fi = fa->fa_info; 936 atomic_inc(&res->fi->fib_clntref); 937 return 0; 938 } 939 940 /* Find appropriate source address to this destination */ 941 942 __be32 __fib_res_prefsrc(struct fib_result *res) 943 { 944 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); 945 } 946 947 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 948 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos, 949 struct fib_info *fi, unsigned int flags) 950 { 951 struct nlmsghdr *nlh; 952 struct rtmsg *rtm; 953 954 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags); 955 if (nlh == NULL) 956 return -EMSGSIZE; 957 958 rtm = nlmsg_data(nlh); 959 rtm->rtm_family = AF_INET; 960 rtm->rtm_dst_len = dst_len; 961 rtm->rtm_src_len = 0; 962 rtm->rtm_tos = tos; 963 rtm->rtm_table = tb_id; 964 NLA_PUT_U32(skb, RTA_TABLE, tb_id); 965 rtm->rtm_type = type; 966 rtm->rtm_flags = fi->fib_flags; 967 rtm->rtm_scope = scope; 968 rtm->rtm_protocol = fi->fib_protocol; 969 970 if (rtm->rtm_dst_len) 971 NLA_PUT_BE32(skb, RTA_DST, dst); 972 973 if (fi->fib_priority) 974 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority); 975 976 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) 977 goto nla_put_failure; 978 979 if (fi->fib_prefsrc) 980 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc); 981 982 if (fi->fib_nhs == 1) { 983 if (fi->fib_nh->nh_gw) 984 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw); 985 986 if (fi->fib_nh->nh_oif) 987 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 988 #ifdef CONFIG_NET_CLS_ROUTE 989 if (fi->fib_nh[0].nh_tclassid) 990 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 991 #endif 992 } 993 #ifdef CONFIG_IP_ROUTE_MULTIPATH 994 if (fi->fib_nhs > 1) { 995 struct rtnexthop *rtnh; 996 struct nlattr *mp; 997 998 mp = nla_nest_start(skb, RTA_MULTIPATH); 999 if (mp == NULL) 1000 goto nla_put_failure; 1001 1002 for_nexthops(fi) { 1003 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); 1004 if (rtnh == NULL) 1005 goto nla_put_failure; 1006 1007 rtnh->rtnh_flags = nh->nh_flags & 0xFF; 1008 rtnh->rtnh_hops = nh->nh_weight - 1; 1009 rtnh->rtnh_ifindex = nh->nh_oif; 1010 1011 if (nh->nh_gw) 1012 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 1013 #ifdef CONFIG_NET_CLS_ROUTE 1014 if (nh->nh_tclassid) 1015 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 1016 #endif 1017 /* length of rtnetlink header + attributes */ 1018 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; 1019 } endfor_nexthops(fi); 1020 1021 nla_nest_end(skb, mp); 1022 } 1023 #endif 1024 return nlmsg_end(skb, nlh); 1025 1026 nla_put_failure: 1027 nlmsg_cancel(skb, nlh); 1028 return -EMSGSIZE; 1029 } 1030 1031 /* 1032 Update FIB if: 1033 - local address disappeared -> we must delete all the entries 1034 referring to it. 1035 - device went down -> we must shutdown all nexthops going via it. 1036 */ 1037 int fib_sync_down_addr(struct net *net, __be32 local) 1038 { 1039 int ret = 0; 1040 unsigned int hash = fib_laddr_hashfn(local); 1041 struct hlist_head *head = &fib_info_laddrhash[hash]; 1042 struct hlist_node *node; 1043 struct fib_info *fi; 1044 1045 if (fib_info_laddrhash == NULL || local == 0) 1046 return 0; 1047 1048 hlist_for_each_entry(fi, node, head, fib_lhash) { 1049 if (fi->fib_net != net) 1050 continue; 1051 if (fi->fib_prefsrc == local) { 1052 fi->fib_flags |= RTNH_F_DEAD; 1053 ret++; 1054 } 1055 } 1056 return ret; 1057 } 1058 1059 int fib_sync_down_dev(struct net_device *dev, int force) 1060 { 1061 int ret = 0; 1062 int scope = RT_SCOPE_NOWHERE; 1063 struct fib_info *prev_fi = NULL; 1064 unsigned int hash = fib_devindex_hashfn(dev->ifindex); 1065 struct hlist_head *head = &fib_info_devhash[hash]; 1066 struct hlist_node *node; 1067 struct fib_nh *nh; 1068 1069 if (force) 1070 scope = -1; 1071 1072 hlist_for_each_entry(nh, node, head, nh_hash) { 1073 struct fib_info *fi = nh->nh_parent; 1074 int dead; 1075 1076 BUG_ON(!fi->fib_nhs); 1077 if (nh->nh_dev != dev || fi == prev_fi) 1078 continue; 1079 prev_fi = fi; 1080 dead = 0; 1081 change_nexthops(fi) { 1082 if (nh->nh_flags&RTNH_F_DEAD) 1083 dead++; 1084 else if (nh->nh_dev == dev && 1085 nh->nh_scope != scope) { 1086 nh->nh_flags |= RTNH_F_DEAD; 1087 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1088 spin_lock_bh(&fib_multipath_lock); 1089 fi->fib_power -= nh->nh_power; 1090 nh->nh_power = 0; 1091 spin_unlock_bh(&fib_multipath_lock); 1092 #endif 1093 dead++; 1094 } 1095 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1096 if (force > 1 && nh->nh_dev == dev) { 1097 dead = fi->fib_nhs; 1098 break; 1099 } 1100 #endif 1101 } endfor_nexthops(fi) 1102 if (dead == fi->fib_nhs) { 1103 fi->fib_flags |= RTNH_F_DEAD; 1104 ret++; 1105 } 1106 } 1107 1108 return ret; 1109 } 1110 1111 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1112 1113 /* 1114 Dead device goes up. We wake up dead nexthops. 1115 It takes sense only on multipath routes. 1116 */ 1117 1118 int fib_sync_up(struct net_device *dev) 1119 { 1120 struct fib_info *prev_fi; 1121 unsigned int hash; 1122 struct hlist_head *head; 1123 struct hlist_node *node; 1124 struct fib_nh *nh; 1125 int ret; 1126 1127 if (!(dev->flags&IFF_UP)) 1128 return 0; 1129 1130 prev_fi = NULL; 1131 hash = fib_devindex_hashfn(dev->ifindex); 1132 head = &fib_info_devhash[hash]; 1133 ret = 0; 1134 1135 hlist_for_each_entry(nh, node, head, nh_hash) { 1136 struct fib_info *fi = nh->nh_parent; 1137 int alive; 1138 1139 BUG_ON(!fi->fib_nhs); 1140 if (nh->nh_dev != dev || fi == prev_fi) 1141 continue; 1142 1143 prev_fi = fi; 1144 alive = 0; 1145 change_nexthops(fi) { 1146 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1147 alive++; 1148 continue; 1149 } 1150 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) 1151 continue; 1152 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) 1153 continue; 1154 alive++; 1155 spin_lock_bh(&fib_multipath_lock); 1156 nh->nh_power = 0; 1157 nh->nh_flags &= ~RTNH_F_DEAD; 1158 spin_unlock_bh(&fib_multipath_lock); 1159 } endfor_nexthops(fi) 1160 1161 if (alive > 0) { 1162 fi->fib_flags &= ~RTNH_F_DEAD; 1163 ret++; 1164 } 1165 } 1166 1167 return ret; 1168 } 1169 1170 /* 1171 The algorithm is suboptimal, but it provides really 1172 fair weighted route distribution. 1173 */ 1174 1175 void fib_select_multipath(const struct flowi *flp, struct fib_result *res) 1176 { 1177 struct fib_info *fi = res->fi; 1178 int w; 1179 1180 spin_lock_bh(&fib_multipath_lock); 1181 if (fi->fib_power <= 0) { 1182 int power = 0; 1183 change_nexthops(fi) { 1184 if (!(nh->nh_flags&RTNH_F_DEAD)) { 1185 power += nh->nh_weight; 1186 nh->nh_power = nh->nh_weight; 1187 } 1188 } endfor_nexthops(fi); 1189 fi->fib_power = power; 1190 if (power <= 0) { 1191 spin_unlock_bh(&fib_multipath_lock); 1192 /* Race condition: route has just become dead. */ 1193 res->nh_sel = 0; 1194 return; 1195 } 1196 } 1197 1198 1199 /* w should be random number [0..fi->fib_power-1], 1200 it is pretty bad approximation. 1201 */ 1202 1203 w = jiffies % fi->fib_power; 1204 1205 change_nexthops(fi) { 1206 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { 1207 if ((w -= nh->nh_power) <= 0) { 1208 nh->nh_power--; 1209 fi->fib_power--; 1210 res->nh_sel = nhsel; 1211 spin_unlock_bh(&fib_multipath_lock); 1212 return; 1213 } 1214 } 1215 } endfor_nexthops(fi); 1216 1217 /* Race condition: route has just become dead. */ 1218 res->nh_sel = 0; 1219 spin_unlock_bh(&fib_multipath_lock); 1220 } 1221 #endif 1222