1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * IPv4 Forwarding Information Base: FIB frontend. 7 * 8 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <asm/uaccess.h> 18 #include <linux/bitops.h> 19 #include <linux/capability.h> 20 #include <linux/types.h> 21 #include <linux/kernel.h> 22 #include <linux/mm.h> 23 #include <linux/string.h> 24 #include <linux/socket.h> 25 #include <linux/sockios.h> 26 #include <linux/errno.h> 27 #include <linux/in.h> 28 #include <linux/inet.h> 29 #include <linux/inetdevice.h> 30 #include <linux/netdevice.h> 31 #include <linux/if_addr.h> 32 #include <linux/if_arp.h> 33 #include <linux/skbuff.h> 34 #include <linux/cache.h> 35 #include <linux/init.h> 36 #include <linux/list.h> 37 #include <linux/slab.h> 38 39 #include <net/ip.h> 40 #include <net/protocol.h> 41 #include <net/route.h> 42 #include <net/tcp.h> 43 #include <net/sock.h> 44 #include <net/arp.h> 45 #include <net/ip_fib.h> 46 #include <net/rtnetlink.h> 47 #include <net/xfrm.h> 48 49 #ifndef CONFIG_IP_MULTIPLE_TABLES 50 51 static int __net_init fib4_rules_init(struct net *net) 52 { 53 struct fib_table *local_table, *main_table; 54 55 local_table = fib_trie_table(RT_TABLE_LOCAL); 56 if (local_table == NULL) 57 return -ENOMEM; 58 59 main_table = fib_trie_table(RT_TABLE_MAIN); 60 if (main_table == NULL) 61 goto fail; 62 63 hlist_add_head_rcu(&local_table->tb_hlist, 64 &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX]); 65 hlist_add_head_rcu(&main_table->tb_hlist, 66 &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]); 67 return 0; 68 69 fail: 70 fib_free_table(local_table); 71 return -ENOMEM; 72 } 73 #else 74 75 struct fib_table *fib_new_table(struct net *net, u32 id) 76 { 77 struct fib_table *tb; 78 unsigned int h; 79 80 if (id == 0) 81 id = RT_TABLE_MAIN; 82 tb = fib_get_table(net, id); 83 if (tb) 84 return tb; 85 86 tb = fib_trie_table(id); 87 if (!tb) 88 return NULL; 89 90 switch (id) { 91 case RT_TABLE_LOCAL: 92 rcu_assign_pointer(net->ipv4.fib_local, tb); 93 break; 94 case RT_TABLE_MAIN: 95 rcu_assign_pointer(net->ipv4.fib_main, tb); 96 break; 97 case RT_TABLE_DEFAULT: 98 rcu_assign_pointer(net->ipv4.fib_default, tb); 99 break; 100 default: 101 break; 102 } 103 104 h = id & (FIB_TABLE_HASHSZ - 1); 105 hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); 106 return tb; 107 } 108 109 /* caller must hold either rtnl or rcu read lock */ 110 struct fib_table *fib_get_table(struct net *net, u32 id) 111 { 112 struct fib_table *tb; 113 struct hlist_head *head; 114 unsigned int h; 115 116 if (id == 0) 117 id = RT_TABLE_MAIN; 118 h = id & (FIB_TABLE_HASHSZ - 1); 119 120 head = &net->ipv4.fib_table_hash[h]; 121 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 122 if (tb->tb_id == id) 123 return tb; 124 } 125 return NULL; 126 } 127 #endif /* CONFIG_IP_MULTIPLE_TABLES */ 128 129 static void fib_flush(struct net *net) 130 { 131 int flushed = 0; 132 unsigned int h; 133 134 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 135 struct hlist_head *head = &net->ipv4.fib_table_hash[h]; 136 struct hlist_node *tmp; 137 struct fib_table *tb; 138 139 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) 140 flushed += fib_table_flush(tb); 141 } 142 143 if (flushed) 144 rt_cache_flush(net); 145 } 146 147 void fib_flush_external(struct net *net) 148 { 149 struct fib_table *tb; 150 struct hlist_head *head; 151 unsigned int h; 152 153 for (h = 0; h < FIB_TABLE_HASHSZ; h++) { 154 head = &net->ipv4.fib_table_hash[h]; 155 hlist_for_each_entry(tb, head, tb_hlist) 156 fib_table_flush_external(tb); 157 } 158 } 159 160 /* 161 * Find address type as if only "dev" was present in the system. If 162 * on_dev is NULL then all interfaces are taken into consideration. 163 */ 164 static inline unsigned int __inet_dev_addr_type(struct net *net, 165 const struct net_device *dev, 166 __be32 addr) 167 { 168 struct flowi4 fl4 = { .daddr = addr }; 169 struct fib_result res; 170 unsigned int ret = RTN_BROADCAST; 171 struct fib_table *local_table; 172 173 if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) 174 return RTN_BROADCAST; 175 if (ipv4_is_multicast(addr)) 176 return RTN_MULTICAST; 177 178 rcu_read_lock(); 179 180 local_table = fib_get_table(net, RT_TABLE_LOCAL); 181 if (local_table) { 182 ret = RTN_UNICAST; 183 if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { 184 if (!dev || dev == res.fi->fib_dev) 185 ret = res.type; 186 } 187 } 188 189 rcu_read_unlock(); 190 return ret; 191 } 192 193 unsigned int inet_addr_type(struct net *net, __be32 addr) 194 { 195 return __inet_dev_addr_type(net, NULL, addr); 196 } 197 EXPORT_SYMBOL(inet_addr_type); 198 199 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, 200 __be32 addr) 201 { 202 return __inet_dev_addr_type(net, dev, addr); 203 } 204 EXPORT_SYMBOL(inet_dev_addr_type); 205 206 __be32 fib_compute_spec_dst(struct sk_buff *skb) 207 { 208 struct net_device *dev = skb->dev; 209 struct in_device *in_dev; 210 struct fib_result res; 211 struct rtable *rt; 212 struct flowi4 fl4; 213 struct net *net; 214 int scope; 215 216 rt = skb_rtable(skb); 217 if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) == 218 RTCF_LOCAL) 219 return ip_hdr(skb)->daddr; 220 221 in_dev = __in_dev_get_rcu(dev); 222 BUG_ON(!in_dev); 223 224 net = dev_net(dev); 225 226 scope = RT_SCOPE_UNIVERSE; 227 if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) { 228 fl4.flowi4_oif = 0; 229 fl4.flowi4_iif = LOOPBACK_IFINDEX; 230 fl4.daddr = ip_hdr(skb)->saddr; 231 fl4.saddr = 0; 232 fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); 233 fl4.flowi4_scope = scope; 234 fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; 235 if (!fib_lookup(net, &fl4, &res)) 236 return FIB_RES_PREFSRC(net, res); 237 } else { 238 scope = RT_SCOPE_LINK; 239 } 240 241 return inet_select_addr(dev, ip_hdr(skb)->saddr, scope); 242 } 243 244 /* Given (packet source, input interface) and optional (dst, oif, tos): 245 * - (main) check, that source is valid i.e. not broadcast or our local 246 * address. 247 * - figure out what "logical" interface this packet arrived 248 * and calculate "specific destination" address. 249 * - check, that packet arrived from expected physical interface. 250 * called with rcu_read_lock() 251 */ 252 static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, 253 u8 tos, int oif, struct net_device *dev, 254 int rpf, struct in_device *idev, u32 *itag) 255 { 256 int ret, no_addr; 257 struct fib_result res; 258 struct flowi4 fl4; 259 struct net *net; 260 bool dev_match; 261 262 fl4.flowi4_oif = 0; 263 fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; 264 fl4.daddr = src; 265 fl4.saddr = dst; 266 fl4.flowi4_tos = tos; 267 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 268 269 no_addr = idev->ifa_list == NULL; 270 271 fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; 272 273 net = dev_net(dev); 274 if (fib_lookup(net, &fl4, &res)) 275 goto last_resort; 276 if (res.type != RTN_UNICAST && 277 (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev))) 278 goto e_inval; 279 if (!rpf && !fib_num_tclassid_users(dev_net(dev)) && 280 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) 281 goto last_resort; 282 fib_combine_itag(itag, &res); 283 dev_match = false; 284 285 #ifdef CONFIG_IP_ROUTE_MULTIPATH 286 for (ret = 0; ret < res.fi->fib_nhs; ret++) { 287 struct fib_nh *nh = &res.fi->fib_nh[ret]; 288 289 if (nh->nh_dev == dev) { 290 dev_match = true; 291 break; 292 } 293 } 294 #else 295 if (FIB_RES_DEV(res) == dev) 296 dev_match = true; 297 #endif 298 if (dev_match) { 299 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 300 return ret; 301 } 302 if (no_addr) 303 goto last_resort; 304 if (rpf == 1) 305 goto e_rpf; 306 fl4.flowi4_oif = dev->ifindex; 307 308 ret = 0; 309 if (fib_lookup(net, &fl4, &res) == 0) { 310 if (res.type == RTN_UNICAST) 311 ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; 312 } 313 return ret; 314 315 last_resort: 316 if (rpf) 317 goto e_rpf; 318 *itag = 0; 319 return 0; 320 321 e_inval: 322 return -EINVAL; 323 e_rpf: 324 return -EXDEV; 325 } 326 327 /* Ignore rp_filter for packets protected by IPsec. */ 328 int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, 329 u8 tos, int oif, struct net_device *dev, 330 struct in_device *idev, u32 *itag) 331 { 332 int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); 333 334 if (!r && !fib_num_tclassid_users(dev_net(dev)) && 335 IN_DEV_ACCEPT_LOCAL(idev) && 336 (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { 337 *itag = 0; 338 return 0; 339 } 340 return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag); 341 } 342 343 static inline __be32 sk_extract_addr(struct sockaddr *addr) 344 { 345 return ((struct sockaddr_in *) addr)->sin_addr.s_addr; 346 } 347 348 static int put_rtax(struct nlattr *mx, int len, int type, u32 value) 349 { 350 struct nlattr *nla; 351 352 nla = (struct nlattr *) ((char *) mx + len); 353 nla->nla_type = type; 354 nla->nla_len = nla_attr_size(4); 355 *(u32 *) nla_data(nla) = value; 356 357 return len + nla_total_size(4); 358 } 359 360 static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, 361 struct fib_config *cfg) 362 { 363 __be32 addr; 364 int plen; 365 366 memset(cfg, 0, sizeof(*cfg)); 367 cfg->fc_nlinfo.nl_net = net; 368 369 if (rt->rt_dst.sa_family != AF_INET) 370 return -EAFNOSUPPORT; 371 372 /* 373 * Check mask for validity: 374 * a) it must be contiguous. 375 * b) destination must have all host bits clear. 376 * c) if application forgot to set correct family (AF_INET), 377 * reject request unless it is absolutely clear i.e. 378 * both family and mask are zero. 379 */ 380 plen = 32; 381 addr = sk_extract_addr(&rt->rt_dst); 382 if (!(rt->rt_flags & RTF_HOST)) { 383 __be32 mask = sk_extract_addr(&rt->rt_genmask); 384 385 if (rt->rt_genmask.sa_family != AF_INET) { 386 if (mask || rt->rt_genmask.sa_family) 387 return -EAFNOSUPPORT; 388 } 389 390 if (bad_mask(mask, addr)) 391 return -EINVAL; 392 393 plen = inet_mask_len(mask); 394 } 395 396 cfg->fc_dst_len = plen; 397 cfg->fc_dst = addr; 398 399 if (cmd != SIOCDELRT) { 400 cfg->fc_nlflags = NLM_F_CREATE; 401 cfg->fc_protocol = RTPROT_BOOT; 402 } 403 404 if (rt->rt_metric) 405 cfg->fc_priority = rt->rt_metric - 1; 406 407 if (rt->rt_flags & RTF_REJECT) { 408 cfg->fc_scope = RT_SCOPE_HOST; 409 cfg->fc_type = RTN_UNREACHABLE; 410 return 0; 411 } 412 413 cfg->fc_scope = RT_SCOPE_NOWHERE; 414 cfg->fc_type = RTN_UNICAST; 415 416 if (rt->rt_dev) { 417 char *colon; 418 struct net_device *dev; 419 char devname[IFNAMSIZ]; 420 421 if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1)) 422 return -EFAULT; 423 424 devname[IFNAMSIZ-1] = 0; 425 colon = strchr(devname, ':'); 426 if (colon) 427 *colon = 0; 428 dev = __dev_get_by_name(net, devname); 429 if (!dev) 430 return -ENODEV; 431 cfg->fc_oif = dev->ifindex; 432 if (colon) { 433 struct in_ifaddr *ifa; 434 struct in_device *in_dev = __in_dev_get_rtnl(dev); 435 if (!in_dev) 436 return -ENODEV; 437 *colon = ':'; 438 for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) 439 if (strcmp(ifa->ifa_label, devname) == 0) 440 break; 441 if (ifa == NULL) 442 return -ENODEV; 443 cfg->fc_prefsrc = ifa->ifa_local; 444 } 445 } 446 447 addr = sk_extract_addr(&rt->rt_gateway); 448 if (rt->rt_gateway.sa_family == AF_INET && addr) { 449 cfg->fc_gw = addr; 450 if (rt->rt_flags & RTF_GATEWAY && 451 inet_addr_type(net, addr) == RTN_UNICAST) 452 cfg->fc_scope = RT_SCOPE_UNIVERSE; 453 } 454 455 if (cmd == SIOCDELRT) 456 return 0; 457 458 if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw) 459 return -EINVAL; 460 461 if (cfg->fc_scope == RT_SCOPE_NOWHERE) 462 cfg->fc_scope = RT_SCOPE_LINK; 463 464 if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) { 465 struct nlattr *mx; 466 int len = 0; 467 468 mx = kzalloc(3 * nla_total_size(4), GFP_KERNEL); 469 if (mx == NULL) 470 return -ENOMEM; 471 472 if (rt->rt_flags & RTF_MTU) 473 len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40); 474 475 if (rt->rt_flags & RTF_WINDOW) 476 len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window); 477 478 if (rt->rt_flags & RTF_IRTT) 479 len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3); 480 481 cfg->fc_mx = mx; 482 cfg->fc_mx_len = len; 483 } 484 485 return 0; 486 } 487 488 /* 489 * Handle IP routing ioctl calls. 490 * These are used to manipulate the routing tables 491 */ 492 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) 493 { 494 struct fib_config cfg; 495 struct rtentry rt; 496 int err; 497 498 switch (cmd) { 499 case SIOCADDRT: /* Add a route */ 500 case SIOCDELRT: /* Delete a route */ 501 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 502 return -EPERM; 503 504 if (copy_from_user(&rt, arg, sizeof(rt))) 505 return -EFAULT; 506 507 rtnl_lock(); 508 err = rtentry_to_fib_config(net, cmd, &rt, &cfg); 509 if (err == 0) { 510 struct fib_table *tb; 511 512 if (cmd == SIOCDELRT) { 513 tb = fib_get_table(net, cfg.fc_table); 514 if (tb) 515 err = fib_table_delete(tb, &cfg); 516 else 517 err = -ESRCH; 518 } else { 519 tb = fib_new_table(net, cfg.fc_table); 520 if (tb) 521 err = fib_table_insert(tb, &cfg); 522 else 523 err = -ENOBUFS; 524 } 525 526 /* allocated by rtentry_to_fib_config() */ 527 kfree(cfg.fc_mx); 528 } 529 rtnl_unlock(); 530 return err; 531 } 532 return -EINVAL; 533 } 534 535 const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { 536 [RTA_DST] = { .type = NLA_U32 }, 537 [RTA_SRC] = { .type = NLA_U32 }, 538 [RTA_IIF] = { .type = NLA_U32 }, 539 [RTA_OIF] = { .type = NLA_U32 }, 540 [RTA_GATEWAY] = { .type = NLA_U32 }, 541 [RTA_PRIORITY] = { .type = NLA_U32 }, 542 [RTA_PREFSRC] = { .type = NLA_U32 }, 543 [RTA_METRICS] = { .type = NLA_NESTED }, 544 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, 545 [RTA_FLOW] = { .type = NLA_U32 }, 546 }; 547 548 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, 549 struct nlmsghdr *nlh, struct fib_config *cfg) 550 { 551 struct nlattr *attr; 552 int err, remaining; 553 struct rtmsg *rtm; 554 555 err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy); 556 if (err < 0) 557 goto errout; 558 559 memset(cfg, 0, sizeof(*cfg)); 560 561 rtm = nlmsg_data(nlh); 562 cfg->fc_dst_len = rtm->rtm_dst_len; 563 cfg->fc_tos = rtm->rtm_tos; 564 cfg->fc_table = rtm->rtm_table; 565 cfg->fc_protocol = rtm->rtm_protocol; 566 cfg->fc_scope = rtm->rtm_scope; 567 cfg->fc_type = rtm->rtm_type; 568 cfg->fc_flags = rtm->rtm_flags; 569 cfg->fc_nlflags = nlh->nlmsg_flags; 570 571 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid; 572 cfg->fc_nlinfo.nlh = nlh; 573 cfg->fc_nlinfo.nl_net = net; 574 575 if (cfg->fc_type > RTN_MAX) { 576 err = -EINVAL; 577 goto errout; 578 } 579 580 nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), remaining) { 581 switch (nla_type(attr)) { 582 case RTA_DST: 583 cfg->fc_dst = nla_get_be32(attr); 584 break; 585 case RTA_OIF: 586 cfg->fc_oif = nla_get_u32(attr); 587 break; 588 case RTA_GATEWAY: 589 cfg->fc_gw = nla_get_be32(attr); 590 break; 591 case RTA_PRIORITY: 592 cfg->fc_priority = nla_get_u32(attr); 593 break; 594 case RTA_PREFSRC: 595 cfg->fc_prefsrc = nla_get_be32(attr); 596 break; 597 case RTA_METRICS: 598 cfg->fc_mx = nla_data(attr); 599 cfg->fc_mx_len = nla_len(attr); 600 break; 601 case RTA_MULTIPATH: 602 cfg->fc_mp = nla_data(attr); 603 cfg->fc_mp_len = nla_len(attr); 604 break; 605 case RTA_FLOW: 606 cfg->fc_flow = nla_get_u32(attr); 607 break; 608 case RTA_TABLE: 609 cfg->fc_table = nla_get_u32(attr); 610 break; 611 } 612 } 613 614 return 0; 615 errout: 616 return err; 617 } 618 619 static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) 620 { 621 struct net *net = sock_net(skb->sk); 622 struct fib_config cfg; 623 struct fib_table *tb; 624 int err; 625 626 err = rtm_to_fib_config(net, skb, nlh, &cfg); 627 if (err < 0) 628 goto errout; 629 630 tb = fib_get_table(net, cfg.fc_table); 631 if (tb == NULL) { 632 err = -ESRCH; 633 goto errout; 634 } 635 636 err = fib_table_delete(tb, &cfg); 637 errout: 638 return err; 639 } 640 641 static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) 642 { 643 struct net *net = sock_net(skb->sk); 644 struct fib_config cfg; 645 struct fib_table *tb; 646 int err; 647 648 err = rtm_to_fib_config(net, skb, nlh, &cfg); 649 if (err < 0) 650 goto errout; 651 652 tb = fib_new_table(net, cfg.fc_table); 653 if (tb == NULL) { 654 err = -ENOBUFS; 655 goto errout; 656 } 657 658 err = fib_table_insert(tb, &cfg); 659 errout: 660 return err; 661 } 662 663 static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) 664 { 665 struct net *net = sock_net(skb->sk); 666 unsigned int h, s_h; 667 unsigned int e = 0, s_e; 668 struct fib_table *tb; 669 struct hlist_head *head; 670 int dumped = 0; 671 672 if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) && 673 ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED) 674 return skb->len; 675 676 s_h = cb->args[0]; 677 s_e = cb->args[1]; 678 679 rcu_read_lock(); 680 681 for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) { 682 e = 0; 683 head = &net->ipv4.fib_table_hash[h]; 684 hlist_for_each_entry_rcu(tb, head, tb_hlist) { 685 if (e < s_e) 686 goto next; 687 if (dumped) 688 memset(&cb->args[2], 0, sizeof(cb->args) - 689 2 * sizeof(cb->args[0])); 690 if (fib_table_dump(tb, skb, cb) < 0) 691 goto out; 692 dumped = 1; 693 next: 694 e++; 695 } 696 } 697 out: 698 rcu_read_unlock(); 699 700 cb->args[1] = e; 701 cb->args[0] = h; 702 703 return skb->len; 704 } 705 706 /* Prepare and feed intra-kernel routing request. 707 * Really, it should be netlink message, but :-( netlink 708 * can be not configured, so that we feed it directly 709 * to fib engine. It is legal, because all events occur 710 * only when netlink is already locked. 711 */ 712 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) 713 { 714 struct net *net = dev_net(ifa->ifa_dev->dev); 715 struct fib_table *tb; 716 struct fib_config cfg = { 717 .fc_protocol = RTPROT_KERNEL, 718 .fc_type = type, 719 .fc_dst = dst, 720 .fc_dst_len = dst_len, 721 .fc_prefsrc = ifa->ifa_local, 722 .fc_oif = ifa->ifa_dev->dev->ifindex, 723 .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND, 724 .fc_nlinfo = { 725 .nl_net = net, 726 }, 727 }; 728 729 if (type == RTN_UNICAST) 730 tb = fib_new_table(net, RT_TABLE_MAIN); 731 else 732 tb = fib_new_table(net, RT_TABLE_LOCAL); 733 734 if (tb == NULL) 735 return; 736 737 cfg.fc_table = tb->tb_id; 738 739 if (type != RTN_LOCAL) 740 cfg.fc_scope = RT_SCOPE_LINK; 741 else 742 cfg.fc_scope = RT_SCOPE_HOST; 743 744 if (cmd == RTM_NEWROUTE) 745 fib_table_insert(tb, &cfg); 746 else 747 fib_table_delete(tb, &cfg); 748 } 749 750 void fib_add_ifaddr(struct in_ifaddr *ifa) 751 { 752 struct in_device *in_dev = ifa->ifa_dev; 753 struct net_device *dev = in_dev->dev; 754 struct in_ifaddr *prim = ifa; 755 __be32 mask = ifa->ifa_mask; 756 __be32 addr = ifa->ifa_local; 757 __be32 prefix = ifa->ifa_address & mask; 758 759 if (ifa->ifa_flags & IFA_F_SECONDARY) { 760 prim = inet_ifa_byprefix(in_dev, prefix, mask); 761 if (prim == NULL) { 762 pr_warn("%s: bug: prim == NULL\n", __func__); 763 return; 764 } 765 } 766 767 fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); 768 769 if (!(dev->flags & IFF_UP)) 770 return; 771 772 /* Add broadcast address, if it is explicitly assigned. */ 773 if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) 774 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 775 776 if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && 777 (prefix != addr || ifa->ifa_prefixlen < 32)) { 778 fib_magic(RTM_NEWROUTE, 779 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, 780 prefix, ifa->ifa_prefixlen, prim); 781 782 /* Add network specific broadcasts, when it takes a sense */ 783 if (ifa->ifa_prefixlen < 31) { 784 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); 785 fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, 786 32, prim); 787 } 788 } 789 } 790 791 /* Delete primary or secondary address. 792 * Optionally, on secondary address promotion consider the addresses 793 * from subnet iprim as deleted, even if they are in device list. 794 * In this case the secondary ifa can be in device list. 795 */ 796 void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) 797 { 798 struct in_device *in_dev = ifa->ifa_dev; 799 struct net_device *dev = in_dev->dev; 800 struct in_ifaddr *ifa1; 801 struct in_ifaddr *prim = ifa, *prim1 = NULL; 802 __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; 803 __be32 any = ifa->ifa_address & ifa->ifa_mask; 804 #define LOCAL_OK 1 805 #define BRD_OK 2 806 #define BRD0_OK 4 807 #define BRD1_OK 8 808 unsigned int ok = 0; 809 int subnet = 0; /* Primary network */ 810 int gone = 1; /* Address is missing */ 811 int same_prefsrc = 0; /* Another primary with same IP */ 812 813 if (ifa->ifa_flags & IFA_F_SECONDARY) { 814 prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); 815 if (prim == NULL) { 816 pr_warn("%s: bug: prim == NULL\n", __func__); 817 return; 818 } 819 if (iprim && iprim != prim) { 820 pr_warn("%s: bug: iprim != prim\n", __func__); 821 return; 822 } 823 } else if (!ipv4_is_zeronet(any) && 824 (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) { 825 fib_magic(RTM_DELROUTE, 826 dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, 827 any, ifa->ifa_prefixlen, prim); 828 subnet = 1; 829 } 830 831 /* Deletion is more complicated than add. 832 * We should take care of not to delete too much :-) 833 * 834 * Scan address list to be sure that addresses are really gone. 835 */ 836 837 for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { 838 if (ifa1 == ifa) { 839 /* promotion, keep the IP */ 840 gone = 0; 841 continue; 842 } 843 /* Ignore IFAs from our subnet */ 844 if (iprim && ifa1->ifa_mask == iprim->ifa_mask && 845 inet_ifa_match(ifa1->ifa_address, iprim)) 846 continue; 847 848 /* Ignore ifa1 if it uses different primary IP (prefsrc) */ 849 if (ifa1->ifa_flags & IFA_F_SECONDARY) { 850 /* Another address from our subnet? */ 851 if (ifa1->ifa_mask == prim->ifa_mask && 852 inet_ifa_match(ifa1->ifa_address, prim)) 853 prim1 = prim; 854 else { 855 /* We reached the secondaries, so 856 * same_prefsrc should be determined. 857 */ 858 if (!same_prefsrc) 859 continue; 860 /* Search new prim1 if ifa1 is not 861 * using the current prim1 862 */ 863 if (!prim1 || 864 ifa1->ifa_mask != prim1->ifa_mask || 865 !inet_ifa_match(ifa1->ifa_address, prim1)) 866 prim1 = inet_ifa_byprefix(in_dev, 867 ifa1->ifa_address, 868 ifa1->ifa_mask); 869 if (!prim1) 870 continue; 871 if (prim1->ifa_local != prim->ifa_local) 872 continue; 873 } 874 } else { 875 if (prim->ifa_local != ifa1->ifa_local) 876 continue; 877 prim1 = ifa1; 878 if (prim != prim1) 879 same_prefsrc = 1; 880 } 881 if (ifa->ifa_local == ifa1->ifa_local) 882 ok |= LOCAL_OK; 883 if (ifa->ifa_broadcast == ifa1->ifa_broadcast) 884 ok |= BRD_OK; 885 if (brd == ifa1->ifa_broadcast) 886 ok |= BRD1_OK; 887 if (any == ifa1->ifa_broadcast) 888 ok |= BRD0_OK; 889 /* primary has network specific broadcasts */ 890 if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) { 891 __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask; 892 __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask; 893 894 if (!ipv4_is_zeronet(any1)) { 895 if (ifa->ifa_broadcast == brd1 || 896 ifa->ifa_broadcast == any1) 897 ok |= BRD_OK; 898 if (brd == brd1 || brd == any1) 899 ok |= BRD1_OK; 900 if (any == brd1 || any == any1) 901 ok |= BRD0_OK; 902 } 903 } 904 } 905 906 if (!(ok & BRD_OK)) 907 fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); 908 if (subnet && ifa->ifa_prefixlen < 31) { 909 if (!(ok & BRD1_OK)) 910 fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); 911 if (!(ok & BRD0_OK)) 912 fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); 913 } 914 if (!(ok & LOCAL_OK)) { 915 fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); 916 917 /* Check, that this local address finally disappeared. */ 918 if (gone && 919 inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { 920 /* And the last, but not the least thing. 921 * We must flush stray FIB entries. 922 * 923 * First of all, we scan fib_info list searching 924 * for stray nexthop entries, then ignite fib_flush. 925 */ 926 if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) 927 fib_flush(dev_net(dev)); 928 } 929 } 930 #undef LOCAL_OK 931 #undef BRD_OK 932 #undef BRD0_OK 933 #undef BRD1_OK 934 } 935 936 static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) 937 { 938 939 struct fib_result res; 940 struct flowi4 fl4 = { 941 .flowi4_mark = frn->fl_mark, 942 .daddr = frn->fl_addr, 943 .flowi4_tos = frn->fl_tos, 944 .flowi4_scope = frn->fl_scope, 945 }; 946 struct fib_table *tb; 947 948 rcu_read_lock(); 949 950 tb = fib_get_table(net, frn->tb_id_in); 951 952 frn->err = -ENOENT; 953 if (tb) { 954 local_bh_disable(); 955 956 frn->tb_id = tb->tb_id; 957 frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); 958 959 if (!frn->err) { 960 frn->prefixlen = res.prefixlen; 961 frn->nh_sel = res.nh_sel; 962 frn->type = res.type; 963 frn->scope = res.scope; 964 } 965 local_bh_enable(); 966 } 967 968 rcu_read_unlock(); 969 } 970 971 static void nl_fib_input(struct sk_buff *skb) 972 { 973 struct net *net; 974 struct fib_result_nl *frn; 975 struct nlmsghdr *nlh; 976 u32 portid; 977 978 net = sock_net(skb->sk); 979 nlh = nlmsg_hdr(skb); 980 if (skb->len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len || 981 nlmsg_len(nlh) < sizeof(*frn)) 982 return; 983 984 skb = netlink_skb_clone(skb, GFP_KERNEL); 985 if (skb == NULL) 986 return; 987 nlh = nlmsg_hdr(skb); 988 989 frn = (struct fib_result_nl *) nlmsg_data(nlh); 990 nl_fib_lookup(net, frn); 991 992 portid = NETLINK_CB(skb).portid; /* netlink portid */ 993 NETLINK_CB(skb).portid = 0; /* from kernel */ 994 NETLINK_CB(skb).dst_group = 0; /* unicast */ 995 netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT); 996 } 997 998 static int __net_init nl_fib_lookup_init(struct net *net) 999 { 1000 struct sock *sk; 1001 struct netlink_kernel_cfg cfg = { 1002 .input = nl_fib_input, 1003 }; 1004 1005 sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg); 1006 if (sk == NULL) 1007 return -EAFNOSUPPORT; 1008 net->ipv4.fibnl = sk; 1009 return 0; 1010 } 1011 1012 static void nl_fib_lookup_exit(struct net *net) 1013 { 1014 netlink_kernel_release(net->ipv4.fibnl); 1015 net->ipv4.fibnl = NULL; 1016 } 1017 1018 static void fib_disable_ip(struct net_device *dev, int force) 1019 { 1020 if (fib_sync_down_dev(dev, force)) 1021 fib_flush(dev_net(dev)); 1022 rt_cache_flush(dev_net(dev)); 1023 arp_ifdown(dev); 1024 } 1025 1026 static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) 1027 { 1028 struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; 1029 struct net_device *dev = ifa->ifa_dev->dev; 1030 struct net *net = dev_net(dev); 1031 1032 switch (event) { 1033 case NETDEV_UP: 1034 fib_add_ifaddr(ifa); 1035 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1036 fib_sync_up(dev); 1037 #endif 1038 atomic_inc(&net->ipv4.dev_addr_genid); 1039 rt_cache_flush(dev_net(dev)); 1040 break; 1041 case NETDEV_DOWN: 1042 fib_del_ifaddr(ifa, NULL); 1043 atomic_inc(&net->ipv4.dev_addr_genid); 1044 if (ifa->ifa_dev->ifa_list == NULL) { 1045 /* Last address was deleted from this interface. 1046 * Disable IP. 1047 */ 1048 fib_disable_ip(dev, 1); 1049 } else { 1050 rt_cache_flush(dev_net(dev)); 1051 } 1052 break; 1053 } 1054 return NOTIFY_DONE; 1055 } 1056 1057 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) 1058 { 1059 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1060 struct in_device *in_dev; 1061 struct net *net = dev_net(dev); 1062 1063 if (event == NETDEV_UNREGISTER) { 1064 fib_disable_ip(dev, 2); 1065 rt_flush_dev(dev); 1066 return NOTIFY_DONE; 1067 } 1068 1069 in_dev = __in_dev_get_rtnl(dev); 1070 if (!in_dev) 1071 return NOTIFY_DONE; 1072 1073 switch (event) { 1074 case NETDEV_UP: 1075 for_ifa(in_dev) { 1076 fib_add_ifaddr(ifa); 1077 } endfor_ifa(in_dev); 1078 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1079 fib_sync_up(dev); 1080 #endif 1081 atomic_inc(&net->ipv4.dev_addr_genid); 1082 rt_cache_flush(net); 1083 break; 1084 case NETDEV_DOWN: 1085 fib_disable_ip(dev, 0); 1086 break; 1087 case NETDEV_CHANGEMTU: 1088 case NETDEV_CHANGE: 1089 rt_cache_flush(net); 1090 break; 1091 } 1092 return NOTIFY_DONE; 1093 } 1094 1095 static struct notifier_block fib_inetaddr_notifier = { 1096 .notifier_call = fib_inetaddr_event, 1097 }; 1098 1099 static struct notifier_block fib_netdev_notifier = { 1100 .notifier_call = fib_netdev_event, 1101 }; 1102 1103 static int __net_init ip_fib_net_init(struct net *net) 1104 { 1105 int err; 1106 size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; 1107 1108 /* Avoid false sharing : Use at least a full cache line */ 1109 size = max_t(size_t, size, L1_CACHE_BYTES); 1110 1111 net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); 1112 if (net->ipv4.fib_table_hash == NULL) 1113 return -ENOMEM; 1114 1115 err = fib4_rules_init(net); 1116 if (err < 0) 1117 goto fail; 1118 return 0; 1119 1120 fail: 1121 kfree(net->ipv4.fib_table_hash); 1122 return err; 1123 } 1124 1125 static void ip_fib_net_exit(struct net *net) 1126 { 1127 unsigned int i; 1128 1129 #ifdef CONFIG_IP_MULTIPLE_TABLES 1130 fib4_rules_exit(net); 1131 #endif 1132 1133 rtnl_lock(); 1134 for (i = 0; i < FIB_TABLE_HASHSZ; i++) { 1135 struct hlist_head *head = &net->ipv4.fib_table_hash[i]; 1136 struct hlist_node *tmp; 1137 struct fib_table *tb; 1138 1139 /* this is done in two passes as flushing the table could 1140 * cause it to be reallocated in order to accommodate new 1141 * tnodes at the root as the table shrinks. 1142 */ 1143 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) 1144 fib_table_flush(tb); 1145 1146 hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) { 1147 #ifdef CONFIG_IP_MULTIPLE_TABLES 1148 switch (tb->tb_id) { 1149 case RT_TABLE_LOCAL: 1150 RCU_INIT_POINTER(net->ipv4.fib_local, NULL); 1151 break; 1152 case RT_TABLE_MAIN: 1153 RCU_INIT_POINTER(net->ipv4.fib_main, NULL); 1154 break; 1155 case RT_TABLE_DEFAULT: 1156 RCU_INIT_POINTER(net->ipv4.fib_default, NULL); 1157 break; 1158 default: 1159 break; 1160 } 1161 #endif 1162 hlist_del(&tb->tb_hlist); 1163 fib_free_table(tb); 1164 } 1165 } 1166 rtnl_unlock(); 1167 kfree(net->ipv4.fib_table_hash); 1168 } 1169 1170 static int __net_init fib_net_init(struct net *net) 1171 { 1172 int error; 1173 1174 #ifdef CONFIG_IP_ROUTE_CLASSID 1175 net->ipv4.fib_num_tclassid_users = 0; 1176 #endif 1177 error = ip_fib_net_init(net); 1178 if (error < 0) 1179 goto out; 1180 error = nl_fib_lookup_init(net); 1181 if (error < 0) 1182 goto out_nlfl; 1183 error = fib_proc_init(net); 1184 if (error < 0) 1185 goto out_proc; 1186 out: 1187 return error; 1188 1189 out_proc: 1190 nl_fib_lookup_exit(net); 1191 out_nlfl: 1192 ip_fib_net_exit(net); 1193 goto out; 1194 } 1195 1196 static void __net_exit fib_net_exit(struct net *net) 1197 { 1198 fib_proc_exit(net); 1199 nl_fib_lookup_exit(net); 1200 ip_fib_net_exit(net); 1201 } 1202 1203 static struct pernet_operations fib_net_ops = { 1204 .init = fib_net_init, 1205 .exit = fib_net_exit, 1206 }; 1207 1208 void __init ip_fib_init(void) 1209 { 1210 rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); 1211 rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); 1212 rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); 1213 1214 register_pernet_subsys(&fib_net_ops); 1215 register_netdevice_notifier(&fib_netdev_notifier); 1216 register_inetaddr_notifier(&fib_inetaddr_notifier); 1217 1218 fib_trie_init(); 1219 } 1220