/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>

#include "fib_lookup.h"

/* Serializes writers of the three fib_info hash tables below and
 * fib_info_cnt.  Taken _bh because readers such as
 * ip_fib_check_default() run from softirq context. */
static DEFINE_SPINLOCK(fib_info_lock);
/* fib_infos hashed by (protocol, prefsrc, priority, nexthop oifs). */
static struct hlist_head *fib_info_hash;
/* fib_infos hashed by their preferred source address (fib_prefsrc). */
static struct hlist_head *fib_info_laddrhash;
/* Current size (buckets) of the two tables above; grown on demand. */
static unsigned int fib_hash_size;
/* Number of live fib_info objects; triggers table growth. */
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
/* Nexthops hashed by the ifindex of their device (fixed size). */
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Protects fib_power / nh_power accounting in fib_select_multipath(). */
static DEFINE_SPINLOCK(fib_multipath_lock);

/* Iterate over all nexthops of (fi) with a const cursor "nh" and index
 * "nhsel"; must be closed with endfor_nexthops(). */
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)

/* Same iteration with a writable cursor "nexthop_nh". */
#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)

#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }


/* Per route-type properties, indexed by RTN_*: the errno a lookup hit
 * returns (0 = deliverable) and the narrowest scope the type allows. */
static const struct
{
	int	error;
	u8	scope;
} fib_props[RTN_MAX + 1] = {
	{
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_UNSPEC */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNICAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},	/* RTN_LOCAL */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_BROADCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},	/* RTN_ANYCAST */
	{
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_MULTICAST */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_BLACKHOLE */
	{
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_UNREACHABLE */
	{
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_PROHIBIT */
	{
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},	/* RTN_THROW */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_NAT */
	{
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},	/* RTN_XRESOLVE */
};


/* Release a nexthop info record */

void free_fib_info(struct fib_info *fi)
{
	/* Refuse to free an object that was never marked dead; this
	 * indicates a refcounting bug in the caller. */
	if (fi->fib_dead == 0) {
		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
		return;
	}
	/* Drop the device reference each nexthop holds. */
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		nexthop_nh->nh_dev = NULL;
	} endfor_nexthops(fi);
	fib_info_cnt--;
	release_net(fi->fib_net);
	kfree(fi);
}

/* Drop one tree reference on @fi; on the last one, unhash it from all
 * three hash tables, mark it dead and drop the client reference. */
void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}

/* Compare the nexthop lists of @fi and @ofi pairwise.  Returns 0 when
 * equivalent, -1 otherwise.  RTNH_F_DEAD is masked out of the flags
 * comparison.  Caller guarantees equal fib_nhs counts. */
static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->nh_oif != onh->nh_oif ||
		    nh->nh_gw  != onh->nh_gw ||
		    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_NET_CLS_ROUTE
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

/* Fold an ifindex into a DEVINDEX_HASHSIZE-bucket hash. */
static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

/* Hash a fib_info over the fields fib_find_info() compares on. */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= fi->fib_protocol;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

/* Find an existing fib_info in fib_info_hash that is semantically
 * identical to the candidate @nfi (same netns, protocol, prefsrc,
 * priority, metrics, flags modulo RTNH_F_DEAD, and nexthops).
 * Returns it without taking a reference, or NULL. */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, node, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(fi->fib_metrics)) == 0 &&
		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}

/* Check, that the gateway is already configured.
   Used only by redirect accept routine.
 */

int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	/* Walk the per-device nexthop hash looking for a live nexthop
	 * through @dev with gateway @gw.  0 = found, -1 = not found. */
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, node, head, nh_hash) {
		if (nh->nh_dev == dev &&
		    nh->nh_gw == gw &&
		    !(nh->nh_flags&RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

/* Upper bound on the netlink message payload needed to dump one route
 * backed by @fi; used to size the skb in rtmsg_fib(). */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4); /* RTA_PREFSRC */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size(fi->fib_nhs * nhsize);
	}

	return payload;
}

/* Broadcast a route add/delete notification (@event) for alias @fa to
 * RTNLGRP_IPV4_ROUTE listeners; on failure report via rtnl_set_sk_err. */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (skb == NULL)
		goto errout;

	err = fib_dump_info(skb, info->pid, seq, event, tb_id,
			    fa->fa_type, fa->fa_scope, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

/* Return the first fib alias matching TOS with
 * priority less than or equal to PRIO.
 */
struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
{
	if (fah) {
		struct fib_alias *fa;
		/* List is kept sorted; a lower-TOS entry also terminates
		 * the search (returned as the insertion point). */
		list_for_each_entry(fa, fah, fa_list) {
			if (fa->fa_tos > tos)
				continue;
			if (fa->fa_info->fib_priority >= prio ||
			    fa->fa_tos < tos)
				return fa;
		}
	}
	return NULL;
}

/* Decide whether default route @fi (position @order) looks dead, based
 * on the ARP state of its first gateway.  Returns 0 if usable, 1 if it
 * should be skipped; tracks the best fallback in *last_resort/*last_idx. */
int fib_detect_death(struct fib_info *fi, int order,
		     struct fib_info **last_resort, int *last_idx, int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state&NUD_VALID) && order != dflt)
		return 0;
	if ((state&NUD_VALID) ||
	    (*last_idx<0 && order > dflt)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Count RTA_MULTIPATH nexthop entries; 0 means malformed config. */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	return remaining > 0 ? 0 : nhs;
}

/* Fill @fi's nexthop array from the RTA_MULTIPATH blob @rtnh.
 * Returns 0 or -EINVAL on a truncated entry. */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg)
{
	change_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		/* Upper flag byte comes from the route, low byte from
		 * the per-nexthop rtnh_flags. */
		nexthop_nh->nh_flags =
			(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
		nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			nexthop_nh->nh_tclassid = nla ?
nla_get_u32(nla) : 0; 413 #endif 414 } 415 416 rtnh = rtnh_next(rtnh, &remaining); 417 } endfor_nexthops(fi); 418 419 return 0; 420 } 421 422 #endif 423 424 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) 425 { 426 #ifdef CONFIG_IP_ROUTE_MULTIPATH 427 struct rtnexthop *rtnh; 428 int remaining; 429 #endif 430 431 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority) 432 return 1; 433 434 if (cfg->fc_oif || cfg->fc_gw) { 435 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && 436 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) 437 return 0; 438 return 1; 439 } 440 441 #ifdef CONFIG_IP_ROUTE_MULTIPATH 442 if (cfg->fc_mp == NULL) 443 return 0; 444 445 rtnh = cfg->fc_mp; 446 remaining = cfg->fc_mp_len; 447 448 for_nexthops(fi) { 449 int attrlen; 450 451 if (!rtnh_ok(rtnh, remaining)) 452 return -EINVAL; 453 454 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif) 455 return 1; 456 457 attrlen = rtnh_attrlen(rtnh); 458 if (attrlen < 0) { 459 struct nlattr *nla, *attrs = rtnh_attrs(rtnh); 460 461 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 462 if (nla && nla_get_be32(nla) != nh->nh_gw) 463 return 1; 464 #ifdef CONFIG_NET_CLS_ROUTE 465 nla = nla_find(attrs, attrlen, RTA_FLOW); 466 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 467 return 1; 468 #endif 469 } 470 471 rtnh = rtnh_next(rtnh, &remaining); 472 } endfor_nexthops(fi); 473 #endif 474 return 0; 475 } 476 477 478 /* 479 Picture 480 ------- 481 482 Semantics of nexthop is very messy by historical reasons. 483 We have to take into account, that: 484 a) gateway can be actually local interface address, 485 so that gatewayed route is direct. 486 b) gateway must be on-link address, possibly 487 described not by an ifaddr, but also by a direct route. 488 c) If both gateway and interface are specified, they should not 489 contradict. 490 d) If we use tunnel routes, gateway could be not on-link. 
   Attempt to reconcile all of these (alas, self-contradictory) conditions
   results in pretty ugly and hairy code with obscure logic.

   I chose to generalized it instead, so that the size
   of code does not increase practically, but it becomes
   much more general.
   Every prefix is assigned a "scope" value: "host" is local address,
   "link" is direct route,
   [ ... "site" ... "interior" ... ]
   and "universe" is true gateway route with global meaning.

   Every prefix refers to a set of "nexthop"s (gw, oif),
   where gw must have narrower scope. This recursion stops
   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
   which means that gw is forced to be on link.

   Code is still hairy, but now it is apparently logically
   consistent and very flexible. F.e. as by-product it allows
   to co-exists in peace independent exterior and interior
   routing processes.

   Normally it looks as following.

   {universe prefix} -> (gw, oif) [scope link]
			|
			|-> {link prefix} -> (gw, oif) [scope local]
					      |
					      |-> {local prefix} (terminal node)
 */

/* Validate and resolve one nexthop of a route being added: find its
 * device, take a reference on it and derive its scope.  Returns 0 or a
 * negative errno. */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
			struct fib_nh *nh)
{
	int err;
	struct net *net;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->nh_gw) {
		struct fib_result res;

		if (nh->nh_flags&RTNH_F_ONLINK) {
			struct net_device *dev;

			/* ONLINK forces the gateway to be on-link: no
			 * recursive lookup, just validate the device. */
			if (cfg->fc_scope >= RT_SCOPE_LINK)
				return -EINVAL;
			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
				return -EINVAL;
			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
				return -ENODEV;
			if (!(dev->flags&IFF_UP))
				return -ENETDOWN;
			nh->nh_dev = dev;
			dev_hold(dev);
			nh->nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		{
			/* Recursive lookup: the gateway must itself be
			 * reachable via a route of strictly narrower scope. */
			struct flowi fl = {
				.nl_u = {
					.ip4_u = {
						.daddr = nh->nh_gw,
						.scope = cfg->fc_scope + 1,
					},
				},
				.oif = nh->nh_oif,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl.fl4_scope < RT_SCOPE_LINK)
				fl.fl4_scope = RT_SCOPE_LINK;
			if ((err = fib_lookup(net, &fl, &res)) != 0)
				return err;
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
			goto out;
		nh->nh_scope = res.scope;
		nh->nh_oif = FIB_RES_OIF(res);
		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
			goto out;
		dev_hold(nh->nh_dev);
		err = -ENETDOWN;
		if (!(nh->nh_dev->flags & IFF_UP))
			goto out;
		err = 0;
out:
		fib_res_put(&res);
		return err;
	} else {
		/* No gateway: a direct route, only the device matters. */
		struct in_device *in_dev;

		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
			return -EINVAL;

		in_dev = inetdev_by_index(net, nh->nh_oif);
		if (in_dev == NULL)
			return -ENODEV;
		if (!(in_dev->dev->flags&IFF_UP)) {
			in_dev_put(in_dev);
			return -ENETDOWN;
		}
		nh->nh_dev = in_dev->dev;
		dev_hold(nh->nh_dev);
		nh->nh_scope = RT_SCOPE_HOST;
		in_dev_put(in_dev);
	}
	return 0;
}

/* Hash a preferred-source address into fib_info_laddrhash. */
static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_hash_size - 1);

	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
}

/* Allocate a zeroed hash table; small tables come from the slab,
 * larger ones straight from the page allocator. */
static struct hlist_head *fib_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
}

/* Free a table obtained from fib_hash_alloc() (NULL is a no-op). */
static void fib_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

/* Rehash every fib_info from the current tables into the (larger)
 * @new_info_hash / @new_laddrhash, publish the new tables, then free
 * the old ones.  Runs under fib_info_lock. */
static void fib_hash_move(struct hlist_head *new_info_hash,
			  struct hlist_head *new_laddrhash,
			  unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	/* New size must be visible before rehashing: fib_info_hashfn()
	 * masks with fib_hash_size - 1. */
	fib_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			hlist_del(&fi->fib_hash);

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *node, *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			hlist_del(&fi->fib_lhash);

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_hash_free(old_info_hash, bytes);
	fib_hash_free(old_laddrhash, bytes);
}

/* Build a fib_info from user configuration @cfg: validate type/scope,
 * resolve all nexthops, and either return an existing identical
 * fib_info (sharing it) or hash and return the new one.  Returns a
 * referenced fib_info or ERR_PTR(-errno). */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Grow (double) the info/laddr hash tables when the object count
	 * reaches the bucket count.  Allocation failure is tolerated
	 * unless no table exists at all yet. */
	if (fib_info_cnt >= fib_hash_size) {
		unsigned int new_size = fib_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 1;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_hash_alloc(bytes);
		new_laddrhash = fib_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_hash_free(new_info_hash, bytes);
			fib_hash_free(new_laddrhash, bytes);
		} else
			fib_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_hash_size)
			goto failure;
	}

	/* fib_nh array is allocated inline after the fib_info. */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (fi == NULL)
		goto failure;
	fib_info_cnt++;

	fi->fib_net = hold_net(net);
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	/* Copy RTA_METRICS (RTAX_*) attributes into fib_metrics. */
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
			int type = nla_type(nla);

			if (type) {
				if (type > RTAX_MAX)
					goto err_inval;
				fi->fib_metrics[type - 1] = nla_get_u32(nla);
			}
		}
	}

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* Top-level oif/gw/flow, if given, must agree with the
		 * first nexthop of the multipath list. */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_NET_CLS_ROUTE
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_NET_CLS_ROUTE
		nh->nh_tclassid = cfg->fc_flow;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	/* Error routes (blackhole, unreachable, ...) have no nexthop. */
	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (nh->nh_dev == NULL)
			goto failure;
	} else {
		change_nexthops(fi) {
			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
				goto failure;
		} endfor_nexthops(fi)
	}

	/* A preferred source address must be one of ours, except the
	 * trivial local-route case. */
	if (fi->fib_prefsrc) {
		if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
		    fi->fib_prefsrc != cfg->fc_dst)
			if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
				goto err_inval;
	}

link_it:
	/* Share an existing identical fib_info if there is one. */
	if ((ofi = fib_find_info(fi)) != NULL) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}

/* Note! fib_semantic_match intentionally uses RCU list functions.
 */
/* Walk the alias list of a matched prefix looking for one usable entry
 * (TOS/scope match, route and nexthop not dead).  Fills @res and takes
 * a client reference on success (0); returns 1 for "no match here" or
 * a negative fib_props error for error-type routes.  RCU read side. */
int fib_semantic_match(struct list_head *head, const struct flowi *flp,
		       struct fib_result *res, int prefixlen)
{
	struct fib_alias *fa;
	int nh_sel = 0;

	list_for_each_entry_rcu(fa, head, fa_list) {
		int err;

		if (fa->fa_tos &&
		    fa->fa_tos != flp->fl4_tos)
			continue;

		if (fa->fa_scope < flp->fl4_scope)
			continue;

		fa->fa_state |= FA_S_ACCESSED;

		err = fib_props[fa->fa_type].error;
		if (err == 0) {
			struct fib_info *fi = fa->fa_info;

			if (fi->fib_flags & RTNH_F_DEAD)
				continue;

			switch (fa->fa_type) {
			case RTN_UNICAST:
			case RTN_LOCAL:
			case RTN_BROADCAST:
			case RTN_ANYCAST:
			case RTN_MULTICAST:
				/* Pick the first live nexthop matching the
				 * requested oif (if any). */
				for_nexthops(fi) {
					if (nh->nh_flags&RTNH_F_DEAD)
						continue;
					if (!flp->oif || flp->oif == nh->nh_oif)
						break;
				}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				if (nhsel < fi->fib_nhs) {
					nh_sel = nhsel;
					goto out_fill_res;
				}
#else
				if (nhsel < 1) {
					goto out_fill_res;
				}
#endif
				endfor_nexthops(fi);
				continue;

			default:
				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
				       fa->fa_type);
				return -EINVAL;
			}
		}
		return err;
	}
	return 1;

out_fill_res:
	res->prefixlen = prefixlen;
	res->nh_sel = nh_sel;
	res->type = fa->fa_type;
	res->scope = fa->fa_scope;
	res->fi = fa->fa_info;
	atomic_inc(&res->fi->fib_clntref);
	return 0;
}

/* Find appropriate source address to this destination */

__be32 __fib_res_prefsrc(struct fib_result *res)
{
	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
}

/* Serialize one route into an RTM_* netlink message on @skb.
 * Returns the message length, or -EMSGSIZE if @skb is too small
 * (the NLA_PUT* macros jump to nla_put_failure on overflow). */
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is only 8 bits; wide table ids go in RTA_TABLE. */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	NLA_PUT_U32(skb, RTA_TABLE, tb_id);
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len)
		NLA_PUT_BE32(skb, RTA_DST, dst);

	if (fi->fib_priority)
		NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);

	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc)
		NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);

	/* Single nexthop: flat RTA_GATEWAY/RTA_OIF attributes. */
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->nh_gw)
			NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);

		if (fi->fib_nh->nh_oif)
			NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
#ifdef CONFIG_NET_CLS_ROUTE
		if (fi->fib_nh[0].nh_tclassid)
			NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
#endif
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	/* Multiple nexthops: nested RTA_MULTIPATH of rtnexthop entries. */
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (mp == NULL)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (rtnh == NULL)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw)
				NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
#ifdef CONFIG_NET_CLS_ROUTE
			if (nh->nh_tclassid)
				NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
#endif
			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

/*
   Update FIB if:
   - local address disappeared -> we must delete all the entries
     referring to it.
   - device went down -> we must shutdown all nexthops going via it.
 */
/* Mark every route whose preferred source is @local dead.
 * Returns the number of routes killed. */
int fib_sync_down_addr(struct net *net, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct hlist_node *node;
	struct fib_info *fi;

	if (fib_info_laddrhash == NULL || local == 0)
		return 0;

	hlist_for_each_entry(fi, node, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

/* Kill nexthops going through @dev; a route dies when all of its
 * nexthops are dead.  @force: 0 = only wider-than-NOWHERE scopes,
 * 1 = any scope, >1 = kill the whole route on first hit (device is
 * being unregistered).  Returns the number of routes killed. */
int fib_sync_down_dev(struct net_device *dev, int force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct hlist_node *node;
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* prev_fi skips re-processing a fib_info whose several
		 * nexthops hash to the same bucket. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
				spin_lock_bh(&fib_multipath_lock);
				fi->fib_power -= nexthop_nh->nh_power;
				nexthop_nh->nh_power = 0;
				spin_unlock_bh(&fib_multipath_lock);
#endif
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (force > 1 && nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/*
   Dead device goes up. We wake up dead nexthops.
   It takes sense only on multipath routes.
 */

int fib_sync_up(struct net_device *dev)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct hlist_node *node;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags&IFF_UP))
		return 0;

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, node, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				alive++;
				continue;
			}
			if (nexthop_nh->nh_dev == NULL ||
			    !(nexthop_nh->nh_dev->flags&IFF_UP))
				continue;
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			/* Revive: reset power so fib_select_multipath()
			 * redistributes weight on next use. */
			spin_lock_bh(&fib_multipath_lock);
			nexthop_nh->nh_power = 0;
			nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
			spin_unlock_bh(&fib_multipath_lock);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~RTNH_F_DEAD;
			ret++;
		}
	}

	return ret;
}

/*
   The algorithm is suboptimal, but it provides really
   fair weighted route distribution.
 */

void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
{
	struct fib_info *fi = res->fi;
	int w;

	spin_lock_bh(&fib_multipath_lock);
	/* Refill: when the total power is exhausted, re-arm every live
	 * nexthop with its configured weight. */
	if (fi->fib_power <= 0) {
		int power = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
				power += nexthop_nh->nh_weight;
				nexthop_nh->nh_power = nexthop_nh->nh_weight;
			}
		} endfor_nexthops(fi);
		fi->fib_power = power;
		if (power <= 0) {
			spin_unlock_bh(&fib_multipath_lock);
			/* Race condition: route has just become dead. */
			res->nh_sel = 0;
			return;
		}
	}


	/* w should be random number [0..fi->fib_power-1],
	   it is pretty bad approximation.
	 */

	w = jiffies % fi->fib_power;

	/* Spend one unit of power from the nexthop w lands on. */
	change_nexthops(fi) {
		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
		    nexthop_nh->nh_power) {
			if ((w -= nexthop_nh->nh_power) <= 0) {
				nexthop_nh->nh_power--;
				fi->fib_power--;
				res->nh_sel = nhsel;
				spin_unlock_bh(&fib_multipath_lock);
				return;
			}
		}
	} endfor_nexthops(fi);

	/* Race condition: route has just become dead. */
	res->nh_sel = 0;
	spin_unlock_bh(&fib_multipath_lock);
}
#endif