/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		IPv4 Forwarding Information Base: semantics.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/netlink.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>

#include "fib_lookup.h"

static DEFINE_SPINLOCK(fib_info_lock);
static struct hlist_head *fib_info_hash;
static struct hlist_head *fib_info_laddrhash;
static unsigned int fib_info_hash_size;
static unsigned int fib_info_cnt;

#define DEVINDEX_HASHBITS 8
#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];

#ifdef CONFIG_IP_ROUTE_MULTIPATH

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh;				\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel; struct fib_nh *nexthop_nh;				\
	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	     nhsel < (fi)->fib_nhs;					\
	     nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope that gcc will optimize the dummy loop away */

#define for_nexthops(fi) {						\
	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
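
/* Illustrative usage sketch (informational, not compiled into anything):
 * the iterator macros above open a brace and start a for loop, so every
 * caller must close the scope with endfor_nexthops().  A hypothetical
 * user looks like:
 *
 *	for_nexthops(fi) {
 *		if (nh->fib_nh_flags & RTNH_F_DEAD)
 *			continue;
 *		use(nh);		// hypothetical helper
 *	} endfor_nexthops(fi);
 *
 * With CONFIG_IP_ROUTE_MULTIPATH disabled the body runs exactly once,
 * over fib_nh[0].
 */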

const struct fib_prop fib_props[RTN_MAX + 1] = {
	[RTN_UNSPEC] = {
		.error	= 0,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_UNICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_LOCAL] = {
		.error	= 0,
		.scope	= RT_SCOPE_HOST,
	},
	[RTN_BROADCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_ANYCAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_LINK,
	},
	[RTN_MULTICAST] = {
		.error	= 0,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_BLACKHOLE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_UNREACHABLE] = {
		.error	= -EHOSTUNREACH,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_PROHIBIT] = {
		.error	= -EACCES,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_THROW] = {
		.error	= -EAGAIN,
		.scope	= RT_SCOPE_UNIVERSE,
	},
	[RTN_NAT] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
	[RTN_XRESOLVE] = {
		.error	= -EINVAL,
		.scope	= RT_SCOPE_NOWHERE,
	},
};

static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
	struct rtable *rt = rcu_dereference_protected(*rtp, 1);

	if (!rt)
		return;

	/* Not even needed: RCU_INIT_POINTER(*rtp, NULL);
	 * because we waited for an RCU grace period before calling
	 * free_fib_info_rcu()
	 */

	dst_dev_put(&rt->dst);
	dst_release_immediate(&rt->dst);
}

static void free_nh_exceptions(struct fib_nh *nh)
{
	struct fnhe_hash_bucket *hash;
	int i;

	hash = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!hash)
		return;
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			next = rcu_dereference_protected(fnhe->fnhe_next, 1);

			rt_fibinfo_free(&fnhe->fnhe_rth_input);
			rt_fibinfo_free(&fnhe->fnhe_rth_output);

			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
	int cpu;

	if (!rtp)
		return;

	for_each_possible_cpu(cpu) {
		struct rtable *rt;

		rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
		if (rt) {
			dst_dev_put(&rt->dst);
			dst_release_immediate(&rt->dst);
		}
	}
	free_percpu(rtp);
}

void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (fib_nh->nh_tclassid)
		net->ipv4.fib_num_tclassid_users--;
#endif
	if (fib_nh->fib_nh_dev)
		dev_put(fib_nh->fib_nh_dev);

	lwtstate_put(fib_nh->fib_nh_lws);
	free_nh_exceptions(fib_nh);
	rt_fibinfo_free_cpus(fib_nh->nh_pcpu_rth_output);
	rt_fibinfo_free(&fib_nh->nh_rth_input);
}

/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
	struct fib_info *fi = container_of(head, struct fib_info, rcu);

	change_nexthops(fi) {
		fib_nh_release(fi->fib_net, nexthop_nh);
	} endfor_nexthops(fi);

	ip_fib_metrics_put(fi->fib_metrics);

	kfree(fi);
}

void free_fib_info(struct fib_info *fi)
{
	if (fi->fib_dead == 0) {
		pr_warn("Freeing alive fib_info %p\n", fi);
		return;
	}
	fib_info_cnt--;

	call_rcu(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);

void fib_release_info(struct fib_info *fi)
{
	spin_lock_bh(&fib_info_lock);
	if (fi && --fi->fib_treeref == 0) {
		hlist_del(&fi->fib_hash);
		if (fi->fib_prefsrc)
			hlist_del(&fi->fib_lhash);
		change_nexthops(fi) {
			if (!nexthop_nh->fib_nh_dev)
				continue;
			hlist_del(&nexthop_nh->nh_hash);
		} endfor_nexthops(fi)
		fi->fib_dead = 1;
		fib_info_put(fi);
	}
	spin_unlock_bh(&fib_info_lock);
}
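
/* Lifecycle sketch (my reading of the code above, not normative):
 * fib_treeref counts fib_alias entries in the trie that share this
 * fib_info, fib_clntref is the refcount_t held by lookup results.
 * Teardown is two-stage:
 *
 *	fib_release_info(fi);	// last tree ref: unhash, set fib_dead,
 *				//   drop the tree's clntref
 *	fib_info_put(fi);	// last clntref -> free_fib_info()
 *				//   -> call_rcu(free_fib_info_rcu)
 *
 * so RCU readers walking the hash tables never see a freed fib_info.
 */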

static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
	const struct fib_nh *onh = ofi->fib_nh;

	for_nexthops(fi) {
		if (nh->fib_nh_oif != onh->fib_nh_oif ||
		    nh->fib_nh_gw4 != onh->fib_nh_gw4 ||
		    nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		    nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
		    nh->nh_tclassid != onh->nh_tclassid ||
#endif
		    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
		    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
			return -1;
		onh++;
	} endfor_nexthops(fi);
	return 0;
}

static inline unsigned int fib_devindex_hashfn(unsigned int val)
{
	unsigned int mask = DEVINDEX_HASHSIZE - 1;

	return (val ^
		(val >> DEVINDEX_HASHBITS) ^
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	for_nexthops(fi) {
		val ^= fib_devindex_hashfn(nh->fib_nh_oif);
	} endfor_nexthops(fi)

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn(nfi);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, nfi->fib_net))
			continue;
		if (fi->fib_nhs != nfi->fib_nhs)
			continue;
		if (nfi->fib_protocol == fi->fib_protocol &&
		    nfi->fib_scope == fi->fib_scope &&
		    nfi->fib_prefsrc == fi->fib_prefsrc &&
		    nfi->fib_priority == fi->fib_priority &&
		    nfi->fib_type == fi->fib_type &&
		    memcmp(nfi->fib_metrics, fi->fib_metrics,
			   sizeof(u32) * RTAX_MAX) == 0 &&
		    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
			return fi;
	}

	return NULL;
}
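
/* Worked example (illustrative): two routes that differ only in their
 * prefix,
 *
 *	ip route add 10.1.0.0/16 via 192.168.1.1
 *	ip route add 10.2.0.0/16 via 192.168.1.1
 *
 * hash to the same bucket and compare equal here, so fib_find_info()
 * lets the second route reuse the first one's fib_info (see
 * fib_create_info() below), with fib_treeref counting the sharers.
 */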

/* Check that the gateway is already configured.
 * Used only by the redirect accept routine.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
	struct hlist_head *head;
	struct fib_nh *nh;
	unsigned int hash;

	spin_lock(&fib_info_lock);

	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	hlist_for_each_entry(nh, head, nh_hash) {
		if (nh->fib_nh_dev == dev &&
		    nh->fib_nh_gw4 == gw &&
		    !(nh->fib_nh_flags & RTNH_F_DEAD)) {
			spin_unlock(&fib_info_lock);
			return 0;
		}
	}

	spin_unlock(&fib_info_lock);

	return -1;
}

static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
	size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
			 + nla_total_size(4) /* RTA_TABLE */
			 + nla_total_size(4) /* RTA_DST */
			 + nla_total_size(4) /* RTA_PRIORITY */
			 + nla_total_size(4) /* RTA_PREFSRC */
			 + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

	/* space for nested metrics */
	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

	if (fi->fib_nhs) {
		size_t nh_encapsize = 0;
		/* Also handles the special case fib_nhs == 1 */

		/* each nexthop is packed in an attribute */
		size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

		/* may contain flow and gateway attribute */
		nhsize += 2 * nla_total_size(4);

		/* grab encap info */
		for_nexthops(fi) {
			if (nh->fib_nh_lws) {
				/* RTA_ENCAP_TYPE */
				nh_encapsize += lwtunnel_get_encap_size(
						nh->fib_nh_lws);
				/* RTA_ENCAP */
				nh_encapsize += nla_total_size(2);
			}
		} endfor_nexthops(fi);

		/* all nexthops are packed in a nested attribute */
		payload += nla_total_size((fi->fib_nhs * nhsize) +
					  nh_encapsize);
	}

	return payload;
}
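
/* Sizing arithmetic, for reference: nla_total_size(n) is
 * NLA_ALIGN(NLA_HDRLEN + n), so nla_total_size(4) == 8 bytes per u32
 * attribute, and the nested metrics line above reserves
 * RTAX_MAX * 8 bytes of payload plus the nest's own header.  These are
 * upper bounds; fib_dump_info() may emit less, and -EMSGSIZE from it
 * would indicate a bug in fib_nlmsg_size() (see the WARN_ON below).
 */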

void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
	       int dst_len, u32 tb_id, const struct nl_info *info,
	       unsigned int nlm_flags)
{
	struct sk_buff *skb;
	u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
	int err = -ENOBUFS;

	skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
	if (!skb)
		goto errout;

	err = fib_dump_info(skb, info->portid, seq, event, tb_id,
			    fa->fa_type, key, dst_len,
			    fa->fa_tos, fa->fa_info, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
		    info->nlh, GFP_KERNEL);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

static int fib_detect_death(struct fib_info *fi, int order,
			    struct fib_info **last_resort, int *last_idx,
			    int dflt)
{
	struct neighbour *n;
	int state = NUD_NONE;

	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].fib_nh_gw4, fi->fib_dev);
	if (n) {
		state = n->nud_state;
		neigh_release(n);
	} else {
		return 0;
	}
	if (state == NUD_REACHABLE)
		return 0;
	if ((state & NUD_VALID) && order != dflt)
		return 0;
	if ((state & NUD_VALID) ||
	    (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
		*last_resort = fi;
		*last_idx = order;
	}
	return 1;
}

int fib_nh_init(struct net *net, struct fib_nh *nh,
		struct fib_config *cfg, int nh_weight,
		struct netlink_ext_ack *extack)
{
	int err = -ENOMEM;

	nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
	if (!nh->nh_pcpu_rth_output)
		goto err_out;

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = -EINVAL;
		if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
			NL_SET_ERR_MSG(extack, "LWT encap type not specified");
			goto lwt_failure;
		}
		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET, cfg,
					   &lwtstate, extack);
		if (err)
			goto lwt_failure;

		nh->fib_nh_lws = lwtstate_get(lwtstate);
	}

	nh->fib_nh_oif = cfg->fc_oif;
	nh->fib_nh_gw4 = cfg->fc_gw;
	nh->fib_nh_flags = cfg->fc_flags;

#ifdef CONFIG_IP_ROUTE_CLASSID
	nh->nh_tclassid = cfg->fc_flow;
	if (nh->nh_tclassid)
		net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	nh->fib_nh_weight = nh_weight;
#endif
	return 0;

lwt_failure:
	rt_fibinfo_free_cpus(nh->nh_pcpu_rth_output);
	nh->nh_pcpu_rth_output = NULL;
err_out:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
			      struct netlink_ext_ack *extack)
{
	int nhs = 0;

	while (rtnh_ok(rtnh, remaining)) {
		nhs++;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0) {
		NL_SET_ERR_MSG(extack,
			       "Invalid nexthop configuration - extra data after nexthops");
		nhs = 0;
	}

	return nhs;
}
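
/* Wire-format sketch (abridged from the rtnetlink uapi, for reference):
 * RTA_MULTIPATH carries an array of variable-length records, each a
 * struct rtnexthop optionally followed by nested attributes:
 *
 *	struct rtnexthop {
 *		unsigned short	rtnh_len;	// record len, incl. attrs
 *		unsigned char	rtnh_flags;
 *		unsigned char	rtnh_hops;	// weight - 1
 *		int		rtnh_ifindex;
 *	};
 *
 * rtnh_ok()/rtnh_next() walk that array; bytes left over after the walk
 * mean a malformed message, which is why fib_count_nexthops() above
 * returns 0 for trailing garbage.
 */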

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	struct net *net = fi->fib_net;
	struct fib_config fib_cfg;
	int ret;

	change_nexthops(fi) {
		int attrlen;

		memset(&fib_cfg, 0, sizeof(fib_cfg));

		if (!rtnh_ok(rtnh, remaining)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid nexthop configuration - extra data after nexthop");
			return -EINVAL;
		}

		if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
			return -EINVAL;
		}

		fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
		fib_cfg.fc_oif = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla)
				fib_cfg.fc_gw = nla_get_in_addr(nla);

			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla)
				fib_cfg.fc_flow = nla_get_u32(nla);

			fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				fib_cfg.fc_encap_type = nla_get_u16(nla);
		}

		ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
				  rtnh->rtnh_hops + 1, extack);
		if (ret)
			goto errout;

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);

	ret = -EINVAL;
	if (cfg->fc_oif && fi->fib_nh->fib_nh_oif != cfg->fc_oif) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop device index does not match RTA_OIF");
		goto errout;
	}
	if (cfg->fc_gw && fi->fib_nh->fib_nh_gw4 != cfg->fc_gw) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop gateway does not match RTA_GATEWAY");
		goto errout;
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop class id does not match RTA_FLOW");
		goto errout;
	}
#endif
	ret = 0;
errout:
	return ret;
}

static void fib_rebalance(struct fib_info *fi)
{
	int total;
	int w;

	if (fi->fib_nhs < 2)
		return;

	total = 0;
	for_nexthops(fi) {
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (ip_ignore_linkdown(nh->fib_nh_dev) &&
		    nh->fib_nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->fib_nh_weight;
	} endfor_nexthops(fi);

	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
			   nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->fib_nh_weight;
			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
							    total) - 1;
		}

		atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */

static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
		       int remaining, struct fib_config *cfg,
		       struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");

	return -EINVAL;
}

#define fib_rebalance(fi) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
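
/* Worked example for the upper-bound math in fib_rebalance()
 * (informational): with two live nexthops of weight 1 and 2, total = 3
 * and the 31-bit hash space is split as
 *
 *	nh0: upper_bound = round((1 << 31) * 1 / 3) - 1 =  715827882
 *	nh1: upper_bound = round((1 << 31) * 3 / 3) - 1 = 2147483647
 *
 * so fib_select_multipath() (see below) sends hashes 0..715827882 to
 * nh0 (~1/3 of flows) and the rest to nh1 (~2/3).  A dead or link-down
 * nexthop gets upper_bound = -1 and is never selected.
 */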

static int fib_encap_match(u16 encap_type,
			   struct nlattr *encap,
			   const struct fib_nh *nh,
			   const struct fib_config *cfg,
			   struct netlink_ext_ack *extack)
{
	struct lwtunnel_state *lwtstate;
	int ret, result = 0;

	if (encap_type == LWTUNNEL_ENCAP_NONE)
		return 0;

	ret = lwtunnel_build_state(encap_type, encap, AF_INET,
				   cfg, &lwtstate, extack);
	if (!ret) {
		result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
		lwtstate_free(lwtstate);
	}

	return result;
}

int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	struct rtnexthop *rtnh;
	int remaining;
#endif

	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_oif || cfg->fc_gw) {
		if (cfg->fc_encap) {
			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
					    fi->fib_nh, cfg, extack))
				return 1;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow &&
		    cfg->fc_flow != fi->fib_nh->nh_tclassid)
			return 1;
#endif
		if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->fib_nh_oif) &&
		    (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->fib_nh_gw4))
			return 0;
		return 1;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (!cfg->fc_mp)
		return 0;

	rtnh = cfg->fc_mp;
	remaining = cfg->fc_mp_len;

	for_nexthops(fi) {
		int attrlen;

		if (!rtnh_ok(rtnh, remaining))
			return -EINVAL;

		if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
			return 1;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla && nla_get_in_addr(nla) != nh->fib_nh_gw4)
				return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
			nla = nla_find(attrs, attrlen, RTA_FLOW);
			if (nla && nla_get_u32(nla) != nh->nh_tclassid)
				return 1;
#endif
		}

		rtnh = rtnh_next(rtnh, &remaining);
	} endfor_nexthops(fi);
#endif
	return 0;
}

bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
{
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return true;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 fi_val, val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return false;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
			bool ecn_ca = false;

			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
		} else {
			if (nla_len(nla) != sizeof(u32))
				return false;
			val = nla_get_u32(nla);
		}

		fi_val = fi->fib_metrics->metrics[type - 1];
		if (type == RTAX_FEATURES)
			fi_val &= ~DST_FEATURE_ECN_CA;

		if (fi_val != val)
			return false;
	}

	return true;
}


/*
 * Picture
 * -------
 *
 * The semantics of nexthops are very messy for historical reasons.
 * We have to take into account that:
 * a) a gateway can actually be a local interface address,
 *    so that the gatewayed route is direct.
 * b) a gateway must be on-link, but the on-link address may be
 *    described not by an ifaddr, but by a direct route as well.
 * c) if both a gateway and an interface are specified, they must not
 *    contradict each other.
 * d) with tunnel routes, a gateway may not be on-link.
 *
 * Attempting to reconcile all of these (alas, self-contradictory)
 * conditions results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the amount of code does
 * not increase in practice, but it becomes much more general.
 * Every prefix is assigned a "scope" value: "host" is a local address,
 * "link" is a direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is a true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * Code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g. as a by-product it allows
 * independent exterior and interior routing processes to
 * coexist in peace.
 *
 * Normally it looks like the following.
 *
 * {universe prefix} -> (gw, oif) [scope link]
 *		|
 *		|-> {link prefix} -> (gw, oif) [scope local]
 *			|
 *			|-> {local prefix} (terminal node)
 */
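
/* Concrete example of the scope recursion above (illustrative):
 *
 *	ip route add 192.168.1.0/24 dev eth0		# scope link
 *	ip route add default via 192.168.1.1		# scope universe
 *
 * The default route's gateway resolves through the narrower-scope link
 * route; fib_check_nh() below performs exactly that lookup, clamping
 * flowi4_scope to at least RT_SCOPE_LINK.  Declaring the nexthop
 * "onlink" short-circuits the recursion and trusts the administrator.
 */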
static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
			struct netlink_ext_ack *extack)
{
	int err = 0;
	struct net *net;
	struct net_device *dev;

	net = cfg->fc_nlinfo.nl_net;
	if (nh->fib_nh_gw4) {
		struct fib_result res;

		if (nh->fib_nh_flags & RTNH_F_ONLINK) {
			unsigned int addr_type;

			if (cfg->fc_scope >= RT_SCOPE_LINK) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid scope");
				return -EINVAL;
			}
			dev = __dev_get_by_index(net, nh->fib_nh_oif);
			if (!dev) {
				NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
				return -ENODEV;
			}
			if (!(dev->flags & IFF_UP)) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop device is not up");
				return -ENETDOWN;
			}
			addr_type = inet_addr_type_dev_table(net, dev,
							     nh->fib_nh_gw4);
			if (addr_type != RTN_UNICAST) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid gateway");
				return -EINVAL;
			}
			if (!netif_carrier_ok(dev))
				nh->fib_nh_flags |= RTNH_F_LINKDOWN;
			nh->fib_nh_dev = dev;
			dev_hold(dev);
			nh->fib_nh_scope = RT_SCOPE_LINK;
			return 0;
		}
		rcu_read_lock();
		{
			struct fib_table *tbl = NULL;
			struct flowi4 fl4 = {
				.daddr = nh->fib_nh_gw4,
				.flowi4_scope = cfg->fc_scope + 1,
				.flowi4_oif = nh->fib_nh_oif,
				.flowi4_iif = LOOPBACK_IFINDEX,
			};

			/* It is not necessary, but requires a bit of thinking */
			if (fl4.flowi4_scope < RT_SCOPE_LINK)
				fl4.flowi4_scope = RT_SCOPE_LINK;

			if (cfg->fc_table)
				tbl = fib_get_table(net, cfg->fc_table);

			if (tbl)
				err = fib_table_lookup(tbl, &fl4, &res,
						       FIB_LOOKUP_IGNORE_LINKSTATE |
						       FIB_LOOKUP_NOREF);

			/* on error or if no table given do full lookup. This
			 * is needed for example when nexthops are in the local
			 * table rather than the given table
			 */
			if (!tbl || err) {
				err = fib_lookup(net, &fl4, &res,
						 FIB_LOOKUP_IGNORE_LINKSTATE);
			}

			if (err) {
				NL_SET_ERR_MSG(extack,
					       "Nexthop has invalid gateway");
				rcu_read_unlock();
				return err;
			}
		}
		err = -EINVAL;
		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
			NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
			goto out;
		}
		nh->fib_nh_scope = res.scope;
		nh->fib_nh_oif = FIB_RES_OIF(res);
		nh->fib_nh_dev = dev = FIB_RES_DEV(res);
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "No egress device for nexthop gateway");
			goto out;
		}
		dev_hold(dev);
		if (!netif_carrier_ok(dev))
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
	} else {
		struct in_device *in_dev;

		if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
			NL_SET_ERR_MSG(extack,
				       "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
			return -EINVAL;
		}
		rcu_read_lock();
		err = -ENODEV;
		in_dev = inetdev_by_index(net, nh->fib_nh_oif);
		if (!in_dev)
			goto out;
		err = -ENETDOWN;
		if (!(in_dev->dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
			goto out;
		}
		nh->fib_nh_dev = in_dev->dev;
		dev_hold(nh->fib_nh_dev);
		nh->fib_nh_scope = RT_SCOPE_HOST;
		if (!netif_carrier_ok(nh->fib_nh_dev))
			nh->fib_nh_flags |= RTNH_F_LINKDOWN;
		err = 0;
	}
out:
	rcu_read_unlock();
	return err;
}
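
/* Usage note (illustrative): the RTNH_F_ONLINK path above corresponds to
 *
 *	ip route add 10.0.0.0/8 via 192.0.2.1 dev eth0 onlink
 *
 * where 192.0.2.1 need not be covered by any route on eth0; only the
 * device state and the unicast-ness of the gateway address are checked.
 * Without "onlink", the gateway must itself resolve to a unicast or
 * local route via the fib_lookup() above.
 */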

static inline unsigned int fib_laddr_hashfn(__be32 val)
{
	unsigned int mask = (fib_info_hash_size - 1);

	return ((__force u32)val ^
		((__force u32)val >> 7) ^
		((__force u32)val >> 14)) & mask;
}

static struct hlist_head *fib_info_hash_alloc(int bytes)
{
	if (bytes <= PAGE_SIZE)
		return kzalloc(bytes, GFP_KERNEL);
	else
		return (struct hlist_head *)
			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 get_order(bytes));
}

static void fib_info_hash_free(struct hlist_head *hash, int bytes)
{
	if (!hash)
		return;

	if (bytes <= PAGE_SIZE)
		kfree(hash);
	else
		free_pages((unsigned long) hash, get_order(bytes));
}

static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_info_hash_size = new_size;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}

__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
{
	nh->nh_saddr = inet_select_addr(nh->fib_nh_dev,
					nh->fib_nh_gw4,
					nh->nh_parent->fib_scope);
	nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);

	return nh->nh_saddr;
}
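
/* Sizing note (my reading, not normative): the two tables always grow
 * together, doubling from 16 buckets whenever fib_info_cnt would exceed
 * fib_info_hash_size (see fib_create_info() below).  Entries are
 * rehashed under fib_info_lock; the tables are not RCU-protected, so
 * their users serialize via RTNL and/or fib_info_lock instead.
 */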

static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
	if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
	    fib_prefsrc != cfg->fc_dst) {
		u32 tb_id = cfg->fc_table;
		int rc;

		if (tb_id == RT_TABLE_MAIN)
			tb_id = RT_TABLE_LOCAL;

		rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
					  fib_prefsrc, tb_id);

		if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
			rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
						  fib_prefsrc, RT_TABLE_LOCAL);
		}

		if (rc != RTN_LOCAL)
			return false;
	}
	return true;
}
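
/* Concrete example (illustrative): for
 *
 *	ip route add 10.0.0.0/8 dev eth0 src 192.0.2.5
 *
 * the check above demands that 192.0.2.5 be a local address of this
 * host, looked up in the route's own table first and then in the local
 * table.  An RTN_LOCAL route whose destination equals the preferred
 * source (the address' own local route) is exempt from the lookup.
 */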

struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
		goto err_inval;
	}

	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
		NL_SET_ERR_MSG(extack,
			       "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
		goto err_inval;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 16;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		if (!fib_info_hash_size)
			goto failure;
	}

	fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
	if (!fi)
		goto failure;
	fi->fib_metrics = ip_fib_metrics_init(fi->fib_net, cfg->fc_mx,
					      cfg->fc_mx_len, extack);
	if (unlikely(IS_ERR(fi->fib_metrics))) {
		err = PTR_ERR(fi->fib_metrics);
		kfree(fi);
		return ERR_PTR(err);
	}

	fib_info_cnt++;
	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;
	fi->fib_tb_id = cfg->fc_table;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
	} endfor_nexthops(fi)

	if (cfg->fc_mp)
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
	else
		err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);

	if (err != 0)
		goto failure;

	if (fib_props[cfg->fc_type].error) {
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) {
			NL_SET_ERR_MSG(extack,
				       "Gateway, device and multipath can not be specified for this route type");
			goto err_inval;
		}
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			NL_SET_ERR_MSG(extack, "Invalid route type");
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST) {
		NL_SET_ERR_MSG(extack, "Invalid scope");
		goto err_inval;
	}

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1) {
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have multiple nexthops");
			goto err_inval;
		}
		if (nh->fib_nh_gw4) {
			NL_SET_ERR_MSG(extack,
				       "Route with host scope can not have a gateway");
			goto err_inval;
		}
		nh->fib_nh_scope = RT_SCOPE_NOWHERE;
		nh->fib_nh_dev = dev_get_by_index(net, fi->fib_nh->fib_nh_oif);
		err = -ENODEV;
		if (!nh->fib_nh_dev)
			goto failure;
	} else {
		int linkdown = 0;

		change_nexthops(fi) {
			err = fib_check_nh(cfg, nexthop_nh, extack);
			if (err != 0)
				goto failure;
			if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
				linkdown++;
		} endfor_nexthops(fi)
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
	}

	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
		NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
		goto err_inval;
	}

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
	} endfor_nexthops(fi)

	fib_rebalance(fi);

link_it:
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	refcount_set(&fi->fib_clntref, 1);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->fib_nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->fib_nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
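
/* Example of the fib_props[].error check above (illustrative):
 *
 *	ip route add blackhole 203.0.113.0/24		# accepted
 *	ip route add blackhole 203.0.113.0/24 dev eth0	# -EINVAL here
 *
 * Error-type routes (blackhole/unreachable/prohibit/throw) terminate
 * lookups with the table's error code, so a gateway, device or
 * multipath specification is meaningless for them and is rejected.
 */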

int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len &&
	    nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;
	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;
	if (fi->fib_nhs == 1) {
		if (fi->fib_nh->fib_nh_gw4 &&
		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->fib_nh_gw4))
			goto nla_put_failure;
		if (fi->fib_nh->fib_nh_oif &&
		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->fib_nh_oif))
			goto nla_put_failure;
		if (fi->fib_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
			rcu_read_lock();
			if (ip_ignore_linkdown(fi->fib_nh->fib_nh_dev))
				rtm->rtm_flags |= RTNH_F_DEAD;
			rcu_read_unlock();
		}
		if (fi->fib_nh->fib_nh_flags & RTNH_F_OFFLOAD)
			rtm->rtm_flags |= RTNH_F_OFFLOAD;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
			goto nla_put_failure;
#endif
		if (fi->fib_nh->fib_nh_lws &&
		    lwtunnel_fill_encap(skb, fi->fib_nh->fib_nh_lws) < 0)
			goto nla_put_failure;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		for_nexthops(fi) {
			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->fib_nh_flags & 0xFF;
			if (nh->fib_nh_flags & RTNH_F_LINKDOWN) {
				rcu_read_lock();
				if (ip_ignore_linkdown(nh->fib_nh_dev))
					rtnh->rtnh_flags |= RTNH_F_DEAD;
				rcu_read_unlock();
			}
			rtnh->rtnh_hops = nh->fib_nh_weight - 1;
			rtnh->rtnh_ifindex = nh->fib_nh_oif;

			if (nh->fib_nh_gw4 &&
			    nla_put_in_addr(skb, RTA_GATEWAY, nh->fib_nh_gw4))
				goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
#endif
			if (nh->fib_nh_lws &&
			    lwtunnel_fill_encap(skb, nh->fib_nh_lws) < 0)
				goto nla_put_failure;

			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
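
/* Resulting message layout (sketch): for a two-nexthop route the dump
 * above produces roughly
 *
 *	RTM_NEWROUTE: struct rtmsg
 *	  RTA_TABLE, [RTA_DST], [RTA_PRIORITY], [RTA_PREFSRC], metrics...
 *	  RTA_MULTIPATH
 *	    struct rtnexthop (flags, hops = weight - 1, ifindex)
 *	      [RTA_GATEWAY] [RTA_FLOW] [RTA_ENCAP_TYPE, RTA_ENCAP]
 *	    struct rtnexthop ...
 *
 * Single-nexthop routes put RTA_GATEWAY/RTA_OIF at the top level
 * instead of nesting an RTA_MULTIPATH block.
 */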

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shut down all nexthops going via it.
 */
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
	int ret = 0;
	unsigned int hash = fib_laddr_hashfn(local);
	struct hlist_head *head = &fib_info_laddrhash[hash];
	struct net *net = dev_net(dev);
	int tb_id = l3mdev_fib_table(dev);
	struct fib_info *fi;

	if (!fib_info_laddrhash || local == 0)
		return 0;

	hlist_for_each_entry(fi, head, fib_lhash) {
		if (!net_eq(fi->fib_net, net) ||
		    fi->fib_tb_id != tb_id)
			continue;
		if (fi->fib_prefsrc == local) {
			fi->fib_flags |= RTNH_F_DEAD;
			ret++;
		}
	}
	return ret;
}

static int call_fib_nh_notifiers(struct fib_nh *nh,
				 enum fib_event_type event_type)
{
	bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
	struct fib_nh_notifier_info info = {
		.fib_nh = nh,
	};

	switch (event_type) {
	case FIB_EVENT_NH_ADD:
		if (nh->fib_nh_flags & RTNH_F_DEAD)
			break;
		if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
			break;
		return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
					   &info.info);
	case FIB_EVENT_NH_DEL:
		if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
		    (nh->fib_nh_flags & RTNH_F_DEAD))
			return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
						   event_type, &info.info);
	default:
		break;
	}

	return NOTIFY_DONE;
}

/* Update the PMTU of exceptions when:
 * - the new MTU of the first hop becomes smaller than the PMTU
 * - the old MTU was the same as the PMTU, and it limited discovery of
 *   larger MTUs on the path. With that limit raised, we can now
 *   discover larger MTUs
 * A special case is locked exceptions, for which the PMTU is smaller
 * than the minimal accepted PMTU:
 * - if the new MTU is greater than the PMTU, don't make any change
 * - otherwise, unlock and set PMTU
 */
static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
{
	struct fnhe_hash_bucket *bucket;
	int i;

	bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
	if (!bucket)
		return;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
		     fnhe;
		     fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
			if (fnhe->fnhe_mtu_locked) {
				if (new <= fnhe->fnhe_pmtu) {
					fnhe->fnhe_pmtu = new;
					fnhe->fnhe_mtu_locked = false;
				}
			} else if (new < fnhe->fnhe_pmtu ||
				   orig == fnhe->fnhe_pmtu) {
				fnhe->fnhe_pmtu = new;
			}
		}
	}
}

void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct fib_nh *nh;

	hlist_for_each_entry(nh, head, nh_hash) {
		if (nh->fib_nh_dev == dev)
			nh_update_mtu(nh, dev->mtu, orig_mtu);
	}
}
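
/* Worked example for nh_update_mtu() (informational): say an exception
 * learned fnhe_pmtu = 1500 while dev->mtu was 1500.  Raising the device
 * MTU to 9000 matches the "orig == fnhe_pmtu" case and lifts the cached
 * PMTU, allowing rediscovery; lowering it to 1400 matches "new < pmtu"
 * and clamps the cache down.  A locked exception is only touched once
 * the new MTU drops to or below its PMTU, which also unlocks it.
 */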

/* Event              force Flags           Description
 * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
 * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
 * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
 * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct fib_nh *nh;

	if (force)
		scope = -1;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		if (nh->fib_nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->fib_nh_dev == dev &&
				 nexthop_nh->fib_nh_scope != scope) {
				switch (event) {
				case NETDEV_DOWN:
				case NETDEV_UNREGISTER:
					nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
					/* fall through */
				case NETDEV_CHANGE:
					nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
					break;
				}
				call_fib_nh_notifiers(nexthop_nh,
						      FIB_EVENT_NH_DEL);
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->fib_nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		if (dead == fi->fib_nhs) {
			switch (event) {
			case NETDEV_DOWN:
			case NETDEV_UNREGISTER:
				fi->fib_flags |= RTNH_F_DEAD;
				/* fall through */
			case NETDEV_CHANGE:
				fi->fib_flags |= RTNH_F_LINKDOWN;
				break;
			}
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

/* Must be invoked inside of an RCU protected region. */
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	u8 slen = 32 - res->prefixlen;
	int order = -1, last_idx = -1;
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	u8 last_tos = 0;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		if (fa->fa_slen != slen)
			continue;
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fa->tb_id != tb->tb_id)
			continue;
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_tos == last_tos) {
			if (last_tos)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_tos = fa->fa_tos;
		last_prio = next_fi->fib_priority;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;
		if (!next_fi->fib_nh[0].fib_nh_gw4 ||
		    next_fi->fib_nh[0].fib_nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			if (next_fi != res->fi)
				break;
			fa1 = fa;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, fa1->fa_default)) {
			fib_result_assign(res, fi);
			fa1->fa_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || !fi) {
		if (fa1)
			fa1->fa_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      fa1->fa_default)) {
		fib_result_assign(res, fi);
		fa1->fa_default = order;
		goto out;
	}

	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	fa1->fa_default = last_idx;
out:
	return;
}
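
/* Usage note (illustrative): fib_select_default() only matters when a
 * table holds several candidate default routes of equal specificity,
 * e.g.
 *
 *	ip route add default via 192.0.2.1 metric 10
 *	ip route add default via 192.0.2.2 metric 10
 *
 * It then prefers the first candidate whose gateway's neighbour entry
 * does not look dead (fib_detect_death() above), remembering the choice
 * in fa_default so subsequent lookups stay sticky.
 */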

/*
 * Dead device goes up. We wake up dead nexthops.
 * It makes sense only for multipath routes.
 */
int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = dev_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		if (nh->fib_nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
				alive++;
				continue;
			}
			if (!nexthop_nh->fib_nh_dev ||
			    !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
				continue;
			if (nexthop_nh->fib_nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			nexthop_nh->fib_nh_flags &= ~nh_flags;
			call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~nh_flags;
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
static bool fib_good_nh(const struct fib_nh *nh)
{
	int state = NUD_REACHABLE;

	if (nh->fib_nh_scope == RT_SCOPE_LINK) {
		struct neighbour *n;

		rcu_read_lock_bh();

		n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
					      (__force u32)nh->fib_nh_gw4);
		if (n)
			state = n->nud_state;

		rcu_read_unlock_bh();
	}

	return !!(state & NUD_VALID);
}

void fib_select_multipath(struct fib_result *res, int hash)
{
	struct fib_info *fi = res->fi;
	struct net *net = fi->fib_net;
	bool first = false;

	for_nexthops(fi) {
		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
			if (!fib_good_nh(nh))
				continue;
			if (!first) {
				res->nh_sel = nhsel;
				first = true;
			}
		}

		if (hash > atomic_read(&nh->fib_nh_upper_bound))
			continue;

		res->nh_sel = nhsel;
		return;
	} endfor_nexthops(fi);
}
#endif

void fib_select_path(struct net *net, struct fib_result *res,
		     struct flowi4 *fl4, const struct sk_buff *skb)
{
	if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
		goto check_saddr;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(net, fl4, skb, NULL);

		fib_select_multipath(res, h);
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
	    res->type == RTN_UNICAST)
		fib_select_default(fl4, res);

check_saddr:
	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, *res);
}