1 // SPDX-License-Identifier: GPL-2.0 2 /* Generic nexthop implementation 3 * 4 * Copyright (c) 2017-19 Cumulus Networks 5 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> 6 */ 7 8 #include <linux/nexthop.h> 9 #include <linux/rtnetlink.h> 10 #include <linux/slab.h> 11 #include <net/arp.h> 12 #include <net/ipv6_stubs.h> 13 #include <net/lwtunnel.h> 14 #include <net/ndisc.h> 15 #include <net/nexthop.h> 16 #include <net/route.h> 17 #include <net/sock.h> 18 19 static void remove_nexthop(struct net *net, struct nexthop *nh, 20 struct nl_info *nlinfo); 21 22 #define NH_DEV_HASHBITS 8 23 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS) 24 25 static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { 26 [NHA_ID] = { .type = NLA_U32 }, 27 [NHA_GROUP] = { .type = NLA_BINARY }, 28 [NHA_GROUP_TYPE] = { .type = NLA_U16 }, 29 [NHA_BLACKHOLE] = { .type = NLA_FLAG }, 30 [NHA_OIF] = { .type = NLA_U32 }, 31 [NHA_GATEWAY] = { .type = NLA_BINARY }, 32 [NHA_ENCAP_TYPE] = { .type = NLA_U16 }, 33 [NHA_ENCAP] = { .type = NLA_NESTED }, 34 [NHA_GROUPS] = { .type = NLA_FLAG }, 35 [NHA_MASTER] = { .type = NLA_U32 }, 36 [NHA_FDB] = { .type = NLA_FLAG }, 37 }; 38 39 static int call_nexthop_notifiers(struct net *net, 40 enum nexthop_event_type event_type, 41 struct nexthop *nh) 42 { 43 int err; 44 45 err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, 46 event_type, nh); 47 return notifier_to_errno(err); 48 } 49 50 static unsigned int nh_dev_hashfn(unsigned int val) 51 { 52 unsigned int mask = NH_DEV_HASHSIZE - 1; 53 54 return (val ^ 55 (val >> NH_DEV_HASHBITS) ^ 56 (val >> (NH_DEV_HASHBITS * 2))) & mask; 57 } 58 59 static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) 60 { 61 struct net_device *dev = nhi->fib_nhc.nhc_dev; 62 struct hlist_head *head; 63 unsigned int hash; 64 65 WARN_ON(!dev); 66 67 hash = nh_dev_hashfn(dev->ifindex); 68 head = &net->nexthop.devhash[hash]; 69 hlist_add_head(&nhi->dev_hash, head); 70 } 71 72 static void nexthop_free_mpath(struct nexthop *nh) 73 { 74 struct nh_group *nhg; 75 int i; 76 77 nhg = rcu_dereference_raw(nh->nh_grp); 78 for (i = 0; i < nhg->num_nh; ++i) { 79 struct nh_grp_entry *nhge = &nhg->nh_entries[i]; 80 81 WARN_ON(!list_empty(&nhge->nh_list)); 82 nexthop_put(nhge->nh); 83 } 84 85 WARN_ON(nhg->spare == nhg); 86 87 kfree(nhg->spare); 88 kfree(nhg); 89 } 90 91 static void nexthop_free_single(struct nexthop *nh) 92 { 93 struct nh_info *nhi; 94 95 nhi = rcu_dereference_raw(nh->nh_info); 96 switch (nhi->family) { 97 case AF_INET: 98 fib_nh_release(nh->net, &nhi->fib_nh); 99 break; 100 case AF_INET6: 101 ipv6_stub->fib6_nh_release(&nhi->fib6_nh); 102 break; 103 } 104 kfree(nhi); 105 } 106 107 void nexthop_free_rcu(struct rcu_head *head) 108 { 109 struct nexthop *nh = container_of(head, struct nexthop, rcu); 110 111 if (nh->is_group) 112 nexthop_free_mpath(nh); 113 else 114 nexthop_free_single(nh); 115 116 kfree(nh); 117 } 118 EXPORT_SYMBOL_GPL(nexthop_free_rcu); 119 120 static struct nexthop *nexthop_alloc(void) 121 { 122 struct nexthop *nh; 123 124 nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); 125 if (nh) { 126 INIT_LIST_HEAD(&nh->fi_list); 127 INIT_LIST_HEAD(&nh->f6i_list); 128 INIT_LIST_HEAD(&nh->grp_list); 129 INIT_LIST_HEAD(&nh->fdb_list); 130 } 131 return nh; 132 } 133 134 static struct nh_group *nexthop_grp_alloc(u16 num_nh) 135 { 136 size_t sz = offsetof(struct nexthop, nh_grp) 137 + sizeof(struct nh_group) 138 + sizeof(struct nh_grp_entry) * num_nh; 139 struct nh_group *nhg; 140 141 nhg = kzalloc(sz, GFP_KERNEL); 142 if (nhg) 143 nhg->num_nh = num_nh; 144 145 return nhg; 146 } 147 148 static void nh_base_seq_inc(struct net *net) 149 { 150 while (++net->nexthop.seq == 0) 151 ; 152 } 153 154 /* no reference taken; rcu lock or rtnl must be held */ 155 struct nexthop *nexthop_find_by_id(struct net *net, u32 id) 156 { 157 struct rb_node **pp, *parent = NULL, *next; 158 159 pp = &net->nexthop.rb_root.rb_node; 160 while (1) { 161 struct nexthop *nh; 162 163 next = rcu_dereference_raw(*pp); 164 if (!next) 165 break; 166 parent = next; 167 168 nh = rb_entry(parent, struct nexthop, rb_node); 169 if (id < nh->id) 170 pp = &next->rb_left; 171 else if (id > nh->id) 172 pp = &next->rb_right; 173 else 174 return nh; 175 } 176 return NULL; 177 } 178 EXPORT_SYMBOL_GPL(nexthop_find_by_id); 179 180 /* used for auto id allocation; called with rtnl held */ 181 static u32 nh_find_unused_id(struct net *net) 182 { 183 u32 id_start = net->nexthop.last_id_allocated; 184 185 while (1) { 186 net->nexthop.last_id_allocated++; 187 if (net->nexthop.last_id_allocated == id_start) 188 break; 189 190 if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) 191 return net->nexthop.last_id_allocated; 192 } 193 return 0; 194 } 195 196 static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nhg) 197 { 198 struct nexthop_grp *p; 199 size_t len = nhg->num_nh * sizeof(*p); 200 struct nlattr *nla; 201 u16 group_type = 0; 202 int i; 203 204 if (nhg->mpath) 205 group_type = NEXTHOP_GRP_TYPE_MPATH; 206 207 if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) 208 goto nla_put_failure; 209 210 nla = nla_reserve(skb, NHA_GROUP, len); 211 if (!nla) 212 goto nla_put_failure; 213 214 p = nla_data(nla); 215 for (i = 0; i < nhg->num_nh; ++i) { 216 p->id = nhg->nh_entries[i].nh->id; 217 p->weight = nhg->nh_entries[i].weight - 1; 218 p += 1; 219 } 220 221 return 0; 222 223 nla_put_failure: 224 return -EMSGSIZE; 225 } 226 227 static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, 228 int event, u32 portid, u32 seq, unsigned int nlflags) 229 { 230 struct fib6_nh *fib6_nh; 231 struct fib_nh *fib_nh; 232 struct nlmsghdr *nlh; 233 struct nh_info *nhi; 234 struct nhmsg *nhm; 235 236 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); 237 if (!nlh) 238 return -EMSGSIZE; 239 240 nhm = nlmsg_data(nlh); 241 nhm->nh_family = AF_UNSPEC; 242 nhm->nh_flags = nh->nh_flags; 243 nhm->nh_protocol = nh->protocol; 244 nhm->nh_scope = 0; 245 nhm->resvd = 0; 246 247 if (nla_put_u32(skb, NHA_ID, nh->id)) 248 goto nla_put_failure; 249 250 if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) 251 goto nla_put_failure; 252 253 if (nh->is_group) { 254 struct nh_group *nhg = rtnl_dereference(nh->nh_grp); 255 256 if (nla_put_nh_group(skb, nhg)) 257 goto nla_put_failure; 258 goto out; 259 } 260 261 nhi = rtnl_dereference(nh->nh_info); 262 nhm->nh_family = nhi->family; 263 if (nhi->reject_nh) { 264 if (nla_put_flag(skb, NHA_BLACKHOLE)) 265 goto nla_put_failure; 266 goto out; 267 } else if (!nh->is_fdb_nh) { 268 const struct net_device *dev; 269 270 dev = nhi->fib_nhc.nhc_dev; 271 if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) 272 goto nla_put_failure; 273 } 274 275 nhm->nh_scope = nhi->fib_nhc.nhc_scope; 276 switch (nhi->family) { 277 case AF_INET: 278 fib_nh = &nhi->fib_nh; 279 if (fib_nh->fib_nh_gw_family && 280 nla_put_u32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) 281 goto nla_put_failure; 282 break; 283 284 case AF_INET6: 285 fib6_nh = &nhi->fib6_nh; 286 if (fib6_nh->fib_nh_gw_family && 287 nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) 288 goto nla_put_failure; 289 break; 290 } 291 292 if (nhi->fib_nhc.nhc_lwtstate && 293 lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, 294 NHA_ENCAP, NHA_ENCAP_TYPE) < 0) 295 goto nla_put_failure; 296 297 out: 298 nlmsg_end(skb, nlh); 299 return 0; 300 301 nla_put_failure: 302 nlmsg_cancel(skb, nlh); 303 return -EMSGSIZE; 304 } 305 306 static size_t nh_nlmsg_size_grp(struct nexthop *nh) 307 { 308 struct nh_group *nhg = rtnl_dereference(nh->nh_grp); 309 size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; 310 311 return nla_total_size(sz) + 312 nla_total_size(2); /* NHA_GROUP_TYPE */ 313 } 314 315 static size_t nh_nlmsg_size_single(struct nexthop *nh) 316 { 317 struct nh_info *nhi = rtnl_dereference(nh->nh_info); 318 size_t sz; 319 320 /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE 321 * are mutually exclusive 322 */ 323 sz = nla_total_size(4); /* NHA_OIF */ 324 325 switch (nhi->family) { 326 case AF_INET: 327 if (nhi->fib_nh.fib_nh_gw_family) 328 sz += nla_total_size(4); /* NHA_GATEWAY */ 329 break; 330 331 case AF_INET6: 332 /* NHA_GATEWAY */ 333 if (nhi->fib6_nh.fib_nh_gw_family) 334 sz += nla_total_size(sizeof(const struct in6_addr)); 335 break; 336 } 337 338 if (nhi->fib_nhc.nhc_lwtstate) { 339 sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); 340 sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ 341 } 342 343 return sz; 344 } 345 346 static size_t nh_nlmsg_size(struct nexthop *nh) 347 { 348 size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); 349 350 sz += nla_total_size(4); /* NHA_ID */ 351 352 if (nh->is_group) 353 sz += nh_nlmsg_size_grp(nh); 354 else 355 sz += nh_nlmsg_size_single(nh); 356 357 return sz; 358 } 359 360 static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) 361 { 362 unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; 363 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; 364 struct sk_buff *skb; 365 int err = -ENOBUFS; 366 367 skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); 368 if (!skb) 369 goto errout; 370 371 err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags); 372 if (err < 0) { 373 /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ 374 WARN_ON(err == -EMSGSIZE); 375 kfree_skb(skb); 376 goto errout; 377 } 378 379 rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, 380 info->nlh, gfp_any()); 381 return; 382 errout: 383 if (err < 0) 384 rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); 385 } 386 387 static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, 388 struct netlink_ext_ack *extack) 389 { 390 if (nh->is_group) { 391 struct nh_group *nhg = rtnl_dereference(nh->nh_grp); 392 393 /* nested multipath (group within a group) is not 394 * supported 395 */ 396 if (nhg->mpath) { 397 NL_SET_ERR_MSG(extack, 398 "Multipath group can not be a nexthop within a group"); 399 return false; 400 } 401 } else { 402 struct nh_info *nhi = rtnl_dereference(nh->nh_info); 403 404 if (nhi->reject_nh && npaths > 1) { 405 NL_SET_ERR_MSG(extack, 406 "Blackhole nexthop can not be used in a group with more than 1 path"); 407 return false; 408 } 409 } 410 411 return true; 412 } 413 414 static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, 415 struct netlink_ext_ack *extack) 416 { 417 struct nh_info *nhi; 418 419 if (!nh->is_fdb_nh) { 420 NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); 421 return -EINVAL; 422 } 423 424 nhi = rtnl_dereference(nh->nh_info); 425 if (*nh_family == AF_UNSPEC) { 426 *nh_family = nhi->family; 427 } else if (*nh_family != nhi->family) { 428 NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); 429 return -EINVAL; 430 } 431 432 return 0; 433 } 434 435 static int nh_check_attr_group(struct net *net, struct nlattr *tb[], 436 struct netlink_ext_ack *extack) 437 { 438 unsigned int len = nla_len(tb[NHA_GROUP]); 439 u8 nh_family = AF_UNSPEC; 440 struct nexthop_grp *nhg; 441 unsigned int i, j; 442 u8 nhg_fdb = 0; 443 444 if (len & (sizeof(struct nexthop_grp) - 1)) { 445 NL_SET_ERR_MSG(extack, 446 "Invalid length for nexthop group attribute"); 447 return -EINVAL; 448 } 449 450 /* convert len to number of nexthop ids */ 451 len /= sizeof(*nhg); 452 453 nhg = nla_data(tb[NHA_GROUP]); 454 for (i = 0; i < len; ++i) { 455 if (nhg[i].resvd1 || nhg[i].resvd2) { 456 NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); 457 return -EINVAL; 458 } 459 if (nhg[i].weight > 254) { 460 NL_SET_ERR_MSG(extack, "Invalid value for weight"); 461 return -EINVAL; 462 } 463 for (j = i + 1; j < len; ++j) { 464 if (nhg[i].id == nhg[j].id) { 465 NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); 466 return -EINVAL; 467 } 468 } 469 } 470 471 if (tb[NHA_FDB]) 472 nhg_fdb = 1; 473 nhg = nla_data(tb[NHA_GROUP]); 474 for (i = 0; i < len; ++i) { 475 struct nexthop *nh; 476 477 nh = nexthop_find_by_id(net, nhg[i].id); 478 if (!nh) { 479 NL_SET_ERR_MSG(extack, "Invalid nexthop id"); 480 return -EINVAL; 481 } 482 if (!valid_group_nh(nh, len, extack)) 483 return -EINVAL; 484 485 if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) 486 return -EINVAL; 487 488 if (!nhg_fdb && nh->is_fdb_nh) { 489 NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); 490 return -EINVAL; 491 } 492 } 493 for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { 494 if (!tb[i]) 495 continue; 496 if (tb[NHA_FDB]) 497 continue; 498 NL_SET_ERR_MSG(extack, 499 "No other attributes can be set in nexthop groups"); 500 return -EINVAL; 501 } 502 503 return 0; 504 } 505 506 static bool ipv6_good_nh(const struct fib6_nh *nh) 507 { 508 int state = NUD_REACHABLE; 509 struct neighbour *n; 510 511 rcu_read_lock_bh(); 512 513 n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); 514 if (n) 515 state = n->nud_state; 516 517 rcu_read_unlock_bh(); 518 519 return !!(state & NUD_VALID); 520 } 521 522 static bool ipv4_good_nh(const struct fib_nh *nh) 523 { 524 int state = NUD_REACHABLE; 525 struct neighbour *n; 526 527 rcu_read_lock_bh(); 528 529 n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, 530 (__force u32)nh->fib_nh_gw4); 531 if (n) 532 state = n->nud_state; 533 534 rcu_read_unlock_bh(); 535 536 return !!(state & NUD_VALID); 537 } 538 539 struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) 540 { 541 struct nexthop *rc = NULL; 542 struct nh_group *nhg; 543 int i; 544 545 if (!nh->is_group) 546 return nh; 547 548 nhg = rcu_dereference(nh->nh_grp); 549 for (i = 0; i < nhg->num_nh; ++i) { 550 struct nh_grp_entry *nhge = &nhg->nh_entries[i]; 551 struct nh_info *nhi; 552 553 if (hash > atomic_read(&nhge->upper_bound)) 554 continue; 555 556 if (nhge->nh->is_fdb_nh) 557 return nhge->nh; 558 559 /* nexthops always check if it is good and does 560 * not rely on a sysctl for this behavior 561 */ 562 nhi = rcu_dereference(nhge->nh->nh_info); 563 switch (nhi->family) { 564 case AF_INET: 565 if (ipv4_good_nh(&nhi->fib_nh)) 566 return nhge->nh; 567 break; 568 case AF_INET6: 569 if (ipv6_good_nh(&nhi->fib6_nh)) 570 return nhge->nh; 571 break; 572 } 573 574 if (!rc) 575 rc = nhge->nh; 576 } 577 578 return rc; 579 } 580 EXPORT_SYMBOL_GPL(nexthop_select_path); 581 582 int nexthop_for_each_fib6_nh(struct nexthop *nh, 583 int (*cb)(struct fib6_nh *nh, void *arg), 584 void *arg) 585 { 586 struct nh_info *nhi; 587 int err; 588 589 if (nh->is_group) { 590 struct nh_group *nhg; 591 int i; 592 593 nhg = rcu_dereference_rtnl(nh->nh_grp); 594 for (i = 0; i < nhg->num_nh; i++) { 595 struct nh_grp_entry *nhge = &nhg->nh_entries[i]; 596 597 nhi = rcu_dereference_rtnl(nhge->nh->nh_info); 598 err = cb(&nhi->fib6_nh, arg); 599 if (err) 600 return err; 601 } 602 } else { 603 nhi = rcu_dereference_rtnl(nh->nh_info); 604 err = cb(&nhi->fib6_nh, arg); 605 if (err) 606 return err; 607 } 608 609 return 0; 610 } 611 EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); 612 613 static int check_src_addr(const struct in6_addr *saddr, 614 struct netlink_ext_ack *extack) 615 { 616 if (!ipv6_addr_any(saddr)) { 617 NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); 618 return -EINVAL; 619 } 620 return 0; 621 } 622 623 int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, 624 struct netlink_ext_ack *extack) 625 { 626 struct nh_info *nhi; 627 628 if (nh->is_fdb_nh) { 629 NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); 630 return -EINVAL; 631 } 632 633 /* fib6_src is unique to a fib6_info and limits the ability to cache 634 * routes in fib6_nh within a nexthop that is potentially shared 635 * across multiple fib entries. If the config wants to use source 636 * routing it can not use nexthop objects. mlxsw also does not allow 637 * fib6_src on routes. 638 */ 639 if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) 640 return -EINVAL; 641 642 if (nh->is_group) { 643 struct nh_group *nhg; 644 645 nhg = rtnl_dereference(nh->nh_grp); 646 if (nhg->has_v4) 647 goto no_v4_nh; 648 } else { 649 nhi = rtnl_dereference(nh->nh_info); 650 if (nhi->family == AF_INET) 651 goto no_v4_nh; 652 } 653 654 return 0; 655 no_v4_nh: 656 NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); 657 return -EINVAL; 658 } 659 EXPORT_SYMBOL_GPL(fib6_check_nexthop); 660 661 /* if existing nexthop has ipv6 routes linked to it, need 662 * to verify this new spec works with ipv6 663 */ 664 static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, 665 struct netlink_ext_ack *extack) 666 { 667 struct fib6_info *f6i; 668 669 if (list_empty(&old->f6i_list)) 670 return 0; 671 672 list_for_each_entry(f6i, &old->f6i_list, nh_list) { 673 if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) 674 return -EINVAL; 675 } 676 677 return fib6_check_nexthop(new, NULL, extack); 678 } 679 680 static int nexthop_check_scope(struct nexthop *nh, u8 scope, 681 struct netlink_ext_ack *extack) 682 { 683 struct nh_info *nhi; 684 685 nhi = rtnl_dereference(nh->nh_info); 686 if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { 687 NL_SET_ERR_MSG(extack, 688 "Route with host scope can not have a gateway"); 689 return -EINVAL; 690 } 691 692 if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { 693 NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); 694 return -EINVAL; 695 } 696 697 return 0; 698 } 699 700 /* Invoked by fib add code to verify nexthop by id is ok with 701 * config for prefix; parts of fib_check_nh not done when nexthop 702 * object is used. 703 */ 704 int fib_check_nexthop(struct nexthop *nh, u8 scope, 705 struct netlink_ext_ack *extack) 706 { 707 int err = 0; 708 709 if (nh->is_fdb_nh) { 710 NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); 711 err = -EINVAL; 712 goto out; 713 } 714 715 if (nh->is_group) { 716 struct nh_group *nhg; 717 718 if (scope == RT_SCOPE_HOST) { 719 NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); 720 err = -EINVAL; 721 goto out; 722 } 723 724 nhg = rtnl_dereference(nh->nh_grp); 725 /* all nexthops in a group have the same scope */ 726 err = nexthop_check_scope(nhg->nh_entries[0].nh, scope, extack); 727 } else { 728 err = nexthop_check_scope(nh, scope, extack); 729 } 730 out: 731 return err; 732 } 733 734 static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, 735 struct netlink_ext_ack *extack) 736 { 737 struct fib_info *fi; 738 739 list_for_each_entry(fi, &old->fi_list, nh_list) { 740 int err; 741 742 err = fib_check_nexthop(new, fi->fib_scope, extack); 743 if (err) 744 return err; 745 } 746 return 0; 747 } 748 749 static void nh_group_rebalance(struct nh_group *nhg) 750 { 751 int total = 0; 752 int w = 0; 753 int i; 754 755 for (i = 0; i < nhg->num_nh; ++i) 756 total += nhg->nh_entries[i].weight; 757 758 for (i = 0; i < nhg->num_nh; ++i) { 759 struct nh_grp_entry *nhge = &nhg->nh_entries[i]; 760 int upper_bound; 761 762 w += nhge->weight; 763 upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; 764 atomic_set(&nhge->upper_bound, upper_bound); 765 } 766 } 767 768 static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, 769 struct nl_info *nlinfo) 770 { 771 struct nh_grp_entry *nhges, *new_nhges; 772 struct nexthop *nhp = nhge->nh_parent; 773 struct nexthop *nh = nhge->nh; 774 struct nh_group *nhg, *newg; 775 int i, j; 776 777 WARN_ON(!nh); 778 779 nhg = rtnl_dereference(nhp->nh_grp); 780 newg = nhg->spare; 781 782 /* last entry, keep it visible and remove the parent */ 783 if (nhg->num_nh == 1) { 784 remove_nexthop(net, nhp, nlinfo); 785 return; 786 } 787 788 newg->has_v4 = nhg->has_v4; 789 newg->mpath = nhg->mpath; 790 newg->num_nh = nhg->num_nh; 791 792 /* copy old entries to new except the one getting removed */ 793 nhges = nhg->nh_entries; 794 new_nhges = newg->nh_entries; 795 for (i = 0, j = 0; i < nhg->num_nh; ++i) { 796 /* current nexthop getting removed */ 797 if (nhg->nh_entries[i].nh == nh) { 798 newg->num_nh--; 799 continue; 800 } 801 802 list_del(&nhges[i].nh_list); 803 new_nhges[j].nh_parent = nhges[i].nh_parent; 804 new_nhges[j].nh = nhges[i].nh; 805 new_nhges[j].weight = nhges[i].weight; 806 list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); 807 j++; 808 } 809 810 nh_group_rebalance(newg); 811 rcu_assign_pointer(nhp->nh_grp, newg); 812 813 list_del(&nhge->nh_list); 814 nexthop_put(nhge->nh); 815 816 if (nlinfo) 817 nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); 818 } 819 820 static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, 821 struct nl_info *nlinfo) 822 { 823 struct nh_grp_entry *nhge, *tmp; 824 825 list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) 826 remove_nh_grp_entry(net, nhge, nlinfo); 827 828 /* make sure all see the newly published array before releasing rtnl */ 829 synchronize_rcu(); 830 } 831 832 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) 833 { 834 struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); 835 int i, num_nh = nhg->num_nh; 836 837 for (i = 0; i < num_nh; ++i) { 838 struct nh_grp_entry *nhge = &nhg->nh_entries[i]; 839 840 if (WARN_ON(!nhge->nh)) 841 continue; 842 843 list_del_init(&nhge->nh_list); 844 } 845 } 846 847 /* not called for nexthop replace */ 848 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) 849 { 850 struct fib6_info *f6i, *tmp; 851 bool do_flush = false; 852 struct fib_info *fi; 853 854 call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); 855 856 list_for_each_entry(fi, &nh->fi_list, nh_list) { 857 fi->fib_flags |= RTNH_F_DEAD; 858 do_flush = true; 859 } 860 if (do_flush) 861 fib_flush(net); 862 863 /* ip6_del_rt removes the entry from this list hence the _safe */ 864 list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { 865 /* __ip6_del_rt does a release, so do a hold here */ 866 fib6_info_hold(f6i); 867 ipv6_stub->ip6_del_rt(net, f6i, 868 !net->ipv4.sysctl_nexthop_compat_mode); 869 } 870 } 871 872 static void __remove_nexthop(struct net *net, struct nexthop *nh, 873 struct nl_info *nlinfo) 874 { 875 __remove_nexthop_fib(net, nh); 876 877 if (nh->is_group) { 878 remove_nexthop_group(nh, nlinfo); 879 } else { 880 struct nh_info *nhi; 881 882 nhi = rtnl_dereference(nh->nh_info); 883 if (nhi->fib_nhc.nhc_dev) 884 hlist_del(&nhi->dev_hash); 885 886 remove_nexthop_from_groups(net, nh, nlinfo); 887 } 888 } 889 890 static void remove_nexthop(struct net *net, struct nexthop *nh, 891 struct nl_info *nlinfo) 892 { 893 /* remove from the tree */ 894 rb_erase(&nh->rb_node, &net->nexthop.rb_root); 895 896 if (nlinfo) 897 nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); 898 899 __remove_nexthop(net, nh, nlinfo); 900 nh_base_seq_inc(net); 901 902 nexthop_put(nh); 903 } 904 905 /* if any FIB entries reference this nexthop, any dst entries 906 * need to be regenerated 907 */ 908 static void nh_rt_cache_flush(struct net *net, struct nexthop *nh) 909 { 910 struct fib6_info *f6i; 911 912 if (!list_empty(&nh->fi_list)) 913 rt_cache_flush(net); 914 915 list_for_each_entry(f6i, &nh->f6i_list, nh_list) 916 ipv6_stub->fib6_update_sernum(net, f6i); 917 } 918 919 static int replace_nexthop_grp(struct net *net, struct nexthop *old, 920 struct nexthop *new, 921 struct netlink_ext_ack *extack) 922 { 923 struct nh_group *oldg, *newg; 924 int i; 925 926 if (!new->is_group) { 927 NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); 928 return -EINVAL; 929 } 930 931 oldg = rtnl_dereference(old->nh_grp); 932 newg = rtnl_dereference(new->nh_grp); 933 934 /* update parents - used by nexthop code for cleanup */ 935 for (i = 0; i < newg->num_nh; i++) 936 newg->nh_entries[i].nh_parent = old; 937 938 rcu_assign_pointer(old->nh_grp, newg); 939 940 for (i = 0; i < oldg->num_nh; i++) 941 oldg->nh_entries[i].nh_parent = new; 942 943 rcu_assign_pointer(new->nh_grp, oldg); 944 945 return 0; 946 } 947 948 static int replace_nexthop_single(struct net *net, struct nexthop *old, 949 struct nexthop *new, 950 struct netlink_ext_ack *extack) 951 { 952 struct nh_info *oldi, *newi; 953 954 if (new->is_group) { 955 NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); 956 return -EINVAL; 957 } 958 959 oldi = rtnl_dereference(old->nh_info); 960 newi = rtnl_dereference(new->nh_info); 961 962 newi->nh_parent = old; 963 oldi->nh_parent = new; 964 965 old->protocol = new->protocol; 966 old->nh_flags = new->nh_flags; 967 968 rcu_assign_pointer(old->nh_info, newi); 969 rcu_assign_pointer(new->nh_info, oldi); 970 971 return 0; 972 } 973 974 static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, 975 struct nl_info *info) 976 { 977 struct fib6_info *f6i; 978 979 if (!list_empty(&nh->fi_list)) { 980 struct fib_info *fi; 981 982 /* expectation is a few fib_info per nexthop and then 983 * a lot of routes per fib_info. So mark the fib_info 984 * and then walk the fib tables once 985 */ 986 list_for_each_entry(fi, &nh->fi_list, nh_list) 987 fi->nh_updated = true; 988 989 fib_info_notify_update(net, info); 990 991 list_for_each_entry(fi, &nh->fi_list, nh_list) 992 fi->nh_updated = false; 993 } 994 995 list_for_each_entry(f6i, &nh->f6i_list, nh_list) 996 ipv6_stub->fib6_rt_update(net, f6i, info); 997 } 998 999 /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries 1000 * linked to this nexthop and for all groups that the nexthop 1001 * is a member of 1002 */ 1003 static void nexthop_replace_notify(struct net *net, struct nexthop *nh, 1004 struct nl_info *info) 1005 { 1006 struct nh_grp_entry *nhge; 1007 1008 __nexthop_replace_notify(net, nh, info); 1009 1010 list_for_each_entry(nhge, &nh->grp_list, nh_list) 1011 __nexthop_replace_notify(net, nhge->nh_parent, info); 1012 } 1013 1014 static int replace_nexthop(struct net *net, struct nexthop *old, 1015 struct nexthop *new, struct netlink_ext_ack *extack) 1016 { 1017 bool new_is_reject = false; 1018 struct nh_grp_entry *nhge; 1019 int err; 1020 1021 /* check that existing FIB entries are ok with the 1022 * new nexthop definition 1023 */ 1024 err = fib_check_nh_list(old, new, extack); 1025 if (err) 1026 return err; 1027 1028 err = fib6_check_nh_list(old, new, extack); 1029 if (err) 1030 return err; 1031 1032 if (!new->is_group) { 1033 struct nh_info *nhi = rtnl_dereference(new->nh_info); 1034 1035 new_is_reject = nhi->reject_nh; 1036 } 1037 1038 list_for_each_entry(nhge, &old->grp_list, nh_list) { 1039 /* if new nexthop is a blackhole, any groups using this 1040 * nexthop cannot have more than 1 path 1041 */ 1042 if (new_is_reject && 1043 nexthop_num_path(nhge->nh_parent) > 1) { 1044 NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); 1045 return -EINVAL; 1046 } 1047 1048 err = fib_check_nh_list(nhge->nh_parent, new, extack); 1049 if (err) 1050 return err; 1051 1052 err = fib6_check_nh_list(nhge->nh_parent, new, extack); 1053 if (err) 1054 return err; 1055 } 1056 1057 if (old->is_group) 1058 err = replace_nexthop_grp(net, old, new, extack); 1059 else 1060 err = replace_nexthop_single(net, old, new, extack); 1061 1062 if (!err) { 1063 nh_rt_cache_flush(net, old); 1064 1065 __remove_nexthop(net, new, NULL); 1066 nexthop_put(new); 1067 } 1068 1069 return err; 1070 } 1071 1072 /* called with rtnl_lock held */ 1073 static int insert_nexthop(struct net *net, struct nexthop *new_nh, 1074 struct nh_config *cfg, struct netlink_ext_ack *extack) 1075 { 1076 struct rb_node **pp, *parent = NULL, *next; 1077 struct rb_root *root = &net->nexthop.rb_root; 1078 bool replace = !!(cfg->nlflags & NLM_F_REPLACE); 1079 bool create = !!(cfg->nlflags & NLM_F_CREATE); 1080 u32 new_id = new_nh->id; 1081 int replace_notify = 0; 1082 int rc = -EEXIST; 1083 1084 pp = &root->rb_node; 1085 while (1) { 1086 struct nexthop *nh; 1087 1088 next = rtnl_dereference(*pp); 1089 if (!next) 1090 break; 1091 1092 parent = next; 1093 1094 nh = rb_entry(parent, struct nexthop, rb_node); 1095 if (new_id < nh->id) { 1096 pp = &next->rb_left; 1097 } else if (new_id > nh->id) { 1098 pp = &next->rb_right; 1099 } else if (replace) { 1100 rc = replace_nexthop(net, nh, new_nh, extack); 1101 if (!rc) { 1102 new_nh = nh; /* send notification with old nh */ 1103 replace_notify = 1; 1104 } 1105 goto out; 1106 } else { 1107 /* id already exists and not a replace */ 1108 goto out; 1109 } 1110 } 1111 1112 if (replace && !create) { 1113 NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); 1114 rc = -ENOENT; 1115 goto out; 1116 } 1117 1118 rb_link_node_rcu(&new_nh->rb_node, parent, pp); 1119 rb_insert_color(&new_nh->rb_node, root); 1120 rc = 0; 1121 out: 1122 if (!rc) { 1123 nh_base_seq_inc(net); 1124 nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); 1125 if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) 1126 nexthop_replace_notify(net, new_nh, &cfg->nlinfo); 1127 } 1128 1129 return rc; 1130 } 1131 1132 /* rtnl */ 1133 /* remove all nexthops tied to a device being deleted */ 1134 static void nexthop_flush_dev(struct net_device *dev) 1135 { 1136 unsigned int hash = nh_dev_hashfn(dev->ifindex); 1137 struct net *net = dev_net(dev); 1138 struct hlist_head *head = &net->nexthop.devhash[hash]; 1139 struct hlist_node *n; 1140 struct nh_info *nhi; 1141 1142 hlist_for_each_entry_safe(nhi, n, head, dev_hash) { 1143 if (nhi->fib_nhc.nhc_dev != dev) 1144 continue; 1145 1146 remove_nexthop(net, nhi->nh_parent, NULL); 1147 } 1148 } 1149 1150 /* rtnl; called when net namespace is deleted */ 1151 static void flush_all_nexthops(struct net *net) 1152 { 1153 struct rb_root *root = &net->nexthop.rb_root; 1154 struct rb_node *node; 1155 struct nexthop *nh; 1156 1157 while ((node = rb_first(root))) { 1158 nh = rb_entry(node, struct nexthop, rb_node); 1159 remove_nexthop(net, nh, NULL); 1160 cond_resched(); 1161 } 1162 } 1163 1164 static struct nexthop *nexthop_create_group(struct net *net, 1165 struct nh_config *cfg) 1166 { 1167 struct nlattr *grps_attr = cfg->nh_grp; 1168 struct nexthop_grp *entry = nla_data(grps_attr); 1169 u16 num_nh = nla_len(grps_attr) / sizeof(*entry); 1170 struct nh_group *nhg; 1171 struct nexthop *nh; 1172 int i; 1173 1174 nh = nexthop_alloc(); 1175 if (!nh) 1176 return ERR_PTR(-ENOMEM); 1177 1178 nh->is_group = 1; 1179 1180 nhg = nexthop_grp_alloc(num_nh); 1181 if (!nhg) { 1182 kfree(nh); 1183 return ERR_PTR(-ENOMEM); 1184 } 1185 1186 /* spare group used for removals */ 1187 nhg->spare = nexthop_grp_alloc(num_nh); 1188 if (!nhg->spare) { 1189 kfree(nhg); 1190 kfree(nh); 1191 return ERR_PTR(-ENOMEM); 1192 } 1193 nhg->spare->spare = nhg; 1194 1195 for (i = 0; i < nhg->num_nh; ++i) { 1196 struct nexthop *nhe; 1197 struct nh_info *nhi; 1198 1199 nhe = nexthop_find_by_id(net, entry[i].id); 1200 if (!nexthop_get(nhe)) 1201 goto out_no_nh; 1202 1203 nhi = rtnl_dereference(nhe->nh_info); 1204 if (nhi->family == AF_INET) 1205 nhg->has_v4 = true; 1206 1207 nhg->nh_entries[i].nh = nhe; 1208 nhg->nh_entries[i].weight = entry[i].weight + 1; 1209 list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); 1210 nhg->nh_entries[i].nh_parent = nh; 1211 } 1212 1213 if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { 1214 nhg->mpath = 1; 1215 nh_group_rebalance(nhg); 1216 } 1217 1218 if (cfg->nh_fdb) 1219 nh->is_fdb_nh = 1; 1220 1221 rcu_assign_pointer(nh->nh_grp, nhg); 1222 1223 return nh; 1224 1225 out_no_nh: 1226 for (; i >= 0; --i) 1227 nexthop_put(nhg->nh_entries[i].nh); 1228 1229 kfree(nhg->spare); 1230 kfree(nhg); 1231 kfree(nh); 1232 1233 return ERR_PTR(-ENOENT); 1234 } 1235 1236 static int nh_create_ipv4(struct net *net, struct nexthop *nh, 1237 struct nh_info *nhi, struct nh_config *cfg, 1238 struct netlink_ext_ack *extack) 1239 { 1240 struct fib_nh *fib_nh = &nhi->fib_nh; 1241 struct fib_config fib_cfg = { 1242 .fc_oif = cfg->nh_ifindex, 1243 .fc_gw4 = cfg->gw.ipv4, 1244 .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, 1245 .fc_flags = cfg->nh_flags, 1246 .fc_encap = cfg->nh_encap, 1247 .fc_encap_type = cfg->nh_encap_type, 1248 }; 1249 u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); 1250 int err; 1251 1252 err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); 1253 if (err) { 1254 fib_nh_release(net, fib_nh); 1255 goto out; 1256 } 1257 1258 if (nh->is_fdb_nh) 1259 goto out; 1260 1261 /* sets nh_dev if successful */ 1262 err = fib_check_nh(net, fib_nh, tb_id, 0, extack); 1263 if (!err) { 1264 nh->nh_flags = fib_nh->fib_nh_flags; 1265 fib_info_update_nhc_saddr(net, &fib_nh->nh_common, 1266 fib_nh->fib_nh_scope); 1267 } else { 1268 fib_nh_release(net, fib_nh); 1269 } 1270 out: 1271 return err; 1272 } 1273 1274 static int nh_create_ipv6(struct net *net, struct nexthop *nh, 1275 struct nh_info *nhi, struct nh_config *cfg, 1276 struct netlink_ext_ack *extack) 1277 { 1278 struct fib6_nh *fib6_nh = &nhi->fib6_nh; 1279 struct fib6_config fib6_cfg = { 1280 .fc_table = l3mdev_fib_table(cfg->dev), 1281 .fc_ifindex = cfg->nh_ifindex, 1282 .fc_gateway = cfg->gw.ipv6, 1283 .fc_flags = cfg->nh_flags, 1284 .fc_encap = cfg->nh_encap, 1285 .fc_encap_type = cfg->nh_encap_type, 1286 .fc_is_fdb = cfg->nh_fdb, 1287 }; 1288 int err; 1289 1290 if (!ipv6_addr_any(&cfg->gw.ipv6)) 1291 fib6_cfg.fc_flags |= RTF_GATEWAY; 1292 1293 /* sets nh_dev if successful */ 1294 err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, 1295 extack); 1296 if (err) 1297 ipv6_stub->fib6_nh_release(fib6_nh); 1298 else 1299 nh->nh_flags = fib6_nh->fib_nh_flags; 1300 1301 return err; 1302 } 1303 1304 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, 1305 struct netlink_ext_ack *extack) 1306 { 1307 struct nh_info *nhi; 1308 struct nexthop *nh; 1309 int err = 0; 1310 1311 nh = nexthop_alloc(); 1312 if (!nh) 1313 return ERR_PTR(-ENOMEM); 1314 1315 nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); 1316 if (!nhi) { 1317 kfree(nh); 1318 return ERR_PTR(-ENOMEM); 1319 } 1320 1321 nh->nh_flags = cfg->nh_flags; 1322 nh->net = net; 1323 1324 nhi->nh_parent = nh; 1325 nhi->family = cfg->nh_family; 1326 nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; 1327 1328 if (cfg->nh_fdb) 1329 nh->is_fdb_nh = 1; 1330 1331 if (cfg->nh_blackhole) { 1332 nhi->reject_nh = 1; 1333 cfg->nh_ifindex = net->loopback_dev->ifindex; 1334 } 1335 1336 switch (cfg->nh_family) { 1337 case AF_INET: 1338 err = nh_create_ipv4(net, nh, nhi, cfg, extack); 1339 break; 1340 case AF_INET6: 1341 err = nh_create_ipv6(net, nh, nhi, cfg, extack); 1342 break; 1343 } 1344 1345 if (err) { 1346 kfree(nhi); 1347 kfree(nh); 1348 return ERR_PTR(err); 1349 } 1350 1351 /* add the entry to the device based hash */ 1352 if (!nh->is_fdb_nh) 1353 nexthop_devhash_add(net, nhi); 1354 1355 rcu_assign_pointer(nh->nh_info, nhi); 1356 1357 return nh; 1358 } 1359 1360 /* called with rtnl lock held */ 1361 static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, 1362 struct netlink_ext_ack *extack) 1363 { 1364 struct nexthop *nh; 1365 int err; 1366 1367 if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { 1368 NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); 1369 return ERR_PTR(-EINVAL); 1370 } 1371 1372 if (!cfg->nh_id) { 1373 cfg->nh_id = nh_find_unused_id(net); 1374 if (!cfg->nh_id) { 1375 NL_SET_ERR_MSG(extack, "No unused id"); 1376 return ERR_PTR(-EINVAL); 1377 } 1378 } 1379 1380 if (cfg->nh_grp) 1381 nh = nexthop_create_group(net, cfg); 1382 else 1383 nh = nexthop_create(net, cfg, extack); 1384 1385 if (IS_ERR(nh)) 1386 return nh; 1387 1388 refcount_set(&nh->refcnt, 1); 1389 nh->id = cfg->nh_id; 1390 nh->protocol = cfg->nh_protocol; 1391 nh->net = net; 1392 1393 err = insert_nexthop(net, nh, cfg, extack); 1394 if (err) { 1395 __remove_nexthop(net, nh, NULL); 1396 nexthop_put(nh); 1397 nh = ERR_PTR(err); 1398 } 1399 1400 return nh; 1401 } 1402 1403 static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, 1404 struct nlmsghdr *nlh, struct nh_config *cfg, 1405 struct netlink_ext_ack *extack) 1406 { 1407 struct nhmsg *nhm = nlmsg_data(nlh); 1408 struct nlattr *tb[NHA_MAX + 1]; 1409 int err; 1410 1411 err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, 1412 extack); 1413 if (err < 0) 1414 return err; 1415 1416 err = -EINVAL; 1417 if (nhm->resvd || nhm->nh_scope) { 1418 NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); 1419 goto out; 1420 } 1421 if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { 1422 NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); 1423 goto out; 1424 } 1425 1426 switch (nhm->nh_family) { 1427 case AF_INET: 1428 case AF_INET6: 1429 break; 1430 case AF_UNSPEC: 1431 if (tb[NHA_GROUP]) 1432 break; 1433 fallthrough; 1434 default: 1435 NL_SET_ERR_MSG(extack, "Invalid address family"); 1436 goto out; 1437 } 1438 1439 if (tb[NHA_GROUPS] || tb[NHA_MASTER]) { 1440 NL_SET_ERR_MSG(extack, "Invalid attributes in request"); 1441 goto out; 1442 } 1443 1444 memset(cfg, 0, sizeof(*cfg)); 1445 cfg->nlflags = nlh->nlmsg_flags; 1446 cfg->nlinfo.portid = NETLINK_CB(skb).portid; 1447 cfg->nlinfo.nlh = nlh; 1448 cfg->nlinfo.nl_net = net; 1449 1450 cfg->nh_family = nhm->nh_family; 1451 cfg->nh_protocol = nhm->nh_protocol; 1452 cfg->nh_flags = nhm->nh_flags; 1453 1454 if (tb[NHA_ID]) 1455 cfg->nh_id = nla_get_u32(tb[NHA_ID]); 1456 1457 if (tb[NHA_FDB]) { 1458 if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || 1459 tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { 1460 NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); 1461 goto out; 1462 } 1463 if (nhm->nh_flags) { 1464 NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); 1465 goto out; 1466 } 1467 cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); 1468 } 1469 1470 if (tb[NHA_GROUP]) { 1471 if (nhm->nh_family != AF_UNSPEC) { 1472 NL_SET_ERR_MSG(extack, "Invalid family for group"); 1473 goto out; 1474 } 1475 cfg->nh_grp = tb[NHA_GROUP]; 1476 1477 cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; 1478 if (tb[NHA_GROUP_TYPE]) 1479 cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); 1480 1481 if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { 1482 NL_SET_ERR_MSG(extack, "Invalid group type"); 1483 goto out; 1484 } 1485 err = nh_check_attr_group(net, tb, extack); 1486 1487 /* no other attributes should be set */ 1488 goto out; 1489 } 1490 1491 if (tb[NHA_BLACKHOLE]) { 1492 if (tb[NHA_GATEWAY] || tb[NHA_OIF] || 1493 tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { 1494 NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); 1495 goto out; 1496 } 1497 1498 cfg->nh_blackhole = 1; 1499 err = 0; 1500 goto out; 1501 } 1502 1503 if (!cfg->nh_fdb && !tb[NHA_OIF]) { 1504 NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); 1505 goto out; 1506 } 1507 1508 if (!cfg->nh_fdb && tb[NHA_OIF]) { 1509 cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); 1510 if (cfg->nh_ifindex) 1511 cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); 1512 1513 if (!cfg->dev) { 1514 NL_SET_ERR_MSG(extack, "Invalid device index"); 1515 goto out; 1516 } else if (!(cfg->dev->flags & IFF_UP)) { 1517 NL_SET_ERR_MSG(extack, "Nexthop device is not up"); 1518 err = -ENETDOWN; 1519 goto out; 1520 } else if (!netif_carrier_ok(cfg->dev)) { 1521 NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); 1522 err = -ENETDOWN; 1523 goto out; 1524 } 1525 } 1526 1527 err = -EINVAL; 1528 if (tb[NHA_GATEWAY]) { 1529 struct nlattr *gwa = tb[NHA_GATEWAY]; 1530 1531 switch (cfg->nh_family) { 1532 case AF_INET: 1533 if (nla_len(gwa) != sizeof(u32)) { 1534 NL_SET_ERR_MSG(extack, "Invalid gateway"); 1535 goto out; 1536 } 1537 cfg->gw.ipv4 = nla_get_be32(gwa); 1538 break; 1539 case AF_INET6: 1540 if (nla_len(gwa) != sizeof(struct in6_addr)) { 1541 NL_SET_ERR_MSG(extack, "Invalid gateway"); 1542 goto out; 1543 } 1544 cfg->gw.ipv6 = nla_get_in6_addr(gwa); 1545 break; 1546 default: 1547 NL_SET_ERR_MSG(extack, 1548 "Unknown address family for gateway"); 1549 goto out; 1550 } 1551 } else { 1552 /* device only nexthop (no gateway) */ 1553 if (cfg->nh_flags & RTNH_F_ONLINK) { 1554 NL_SET_ERR_MSG(extack, 1555 "ONLINK flag can not be set for nexthop without a gateway"); 1556 goto out; 1557 } 1558 } 1559 1560 if (tb[NHA_ENCAP]) { 1561 cfg->nh_encap = tb[NHA_ENCAP]; 1562 1563 if (!tb[NHA_ENCAP_TYPE]) { 1564 NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); 1565 goto out; 1566 } 1567 1568 cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); 1569 err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); 1570 if (err < 0) 1571 goto out; 1572 1573 } else if (tb[NHA_ENCAP_TYPE]) { 1574 NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); 1575 goto out; 1576 } 1577 1578 1579 err = 0; 1580 out: 1581 return err; 1582 } 1583 1584 /* rtnl */ 1585 static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, 1586 struct netlink_ext_ack *extack) 1587 { 1588 struct net *net = sock_net(skb->sk); 1589 struct nh_config cfg; 1590 struct nexthop *nh; 1591 int err; 1592 1593 err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); 1594 if (!err) { 1595 nh = nexthop_add(net, &cfg, extack); 1596 if (IS_ERR(nh)) 1597 err = PTR_ERR(nh); 1598 } 1599 1600 return err; 1601 } 1602 1603 static int nh_valid_get_del_req(struct nlmsghdr *nlh, u32 *id, 1604 struct netlink_ext_ack *extack) 1605 { 1606 struct nhmsg *nhm = nlmsg_data(nlh); 1607 struct nlattr *tb[NHA_MAX + 1]; 1608 int err, i; 1609 1610 err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, 1611 extack); 1612 if (err < 0) 1613 return err; 1614 1615 err = -EINVAL; 1616 for (i = 0; i < __NHA_MAX; ++i) { 1617 if (!tb[i]) 1618 continue; 1619 1620 switch (i) { 1621 case NHA_ID: 1622 break; 1623 default: 1624 NL_SET_ERR_MSG_ATTR(extack, tb[i], 1625 "Unexpected attribute in request"); 1626 goto out; 1627 } 1628 } 1629 if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { 1630 NL_SET_ERR_MSG(extack, "Invalid values in header"); 1631 goto out; 1632 } 1633 1634 if (!tb[NHA_ID]) { 1635 NL_SET_ERR_MSG(extack, "Nexthop id is missing"); 1636 goto out; 1637 } 1638 1639 *id = nla_get_u32(tb[NHA_ID]); 1640 if (!(*id)) 1641 NL_SET_ERR_MSG(extack, "Invalid nexthop id"); 1642 else 1643 err = 0; 1644 out: 1645 return err; 1646 } 1647 1648 /* rtnl */ 1649 static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, 1650 struct netlink_ext_ack *extack) 1651 { 1652 struct net *net = sock_net(skb->sk); 1653 struct nl_info nlinfo = { 1654 .nlh = nlh, 1655 .nl_net = net, 1656 .portid = NETLINK_CB(skb).portid, 1657 }; 1658 struct nexthop *nh; 1659 int err; 1660 u32 id; 1661 1662 err = nh_valid_get_del_req(nlh, &id, extack); 1663 if (err) 1664 return err; 1665 1666 nh = nexthop_find_by_id(net, id); 1667 if (!nh) 1668 return -ENOENT; 1669 1670 remove_nexthop(net, nh, &nlinfo); 1671 1672 return 0; 1673 } 1674 1675 /* rtnl */ 1676 static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, 1677 struct netlink_ext_ack *extack) 1678 { 1679 struct net *net = sock_net(in_skb->sk); 1680 struct sk_buff *skb = NULL; 1681 struct nexthop *nh; 1682 int err; 1683 u32 id; 1684 1685 err = nh_valid_get_del_req(nlh, &id, extack); 1686 if (err) 1687 return err; 1688 1689 err = -ENOBUFS; 1690 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 1691 if (!skb) 1692 goto out; 1693 1694 err = -ENOENT; 1695 nh = nexthop_find_by_id(net, id); 1696 if (!nh) 1697 goto errout_free; 1698 1699 err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, 1700 nlh->nlmsg_seq, 0); 1701 if (err < 0) { 1702 WARN_ON(err == -EMSGSIZE); 1703 goto errout_free; 1704 } 1705 1706 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 1707 out: 1708 return err; 1709 errout_free: 1710 kfree_skb(skb); 1711 goto out; 1712 } 1713 1714 static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, 1715 bool group_filter, u8 family) 1716 { 1717 const struct net_device *dev; 1718 const struct nh_info *nhi; 1719 1720 if (group_filter && !nh->is_group) 1721 return true; 1722 1723 if (!dev_idx && !master_idx && !family) 1724 return false; 1725 1726 if (nh->is_group) 1727 return true; 1728 1729 nhi = rtnl_dereference(nh->nh_info); 1730 if (family && nhi->family != family) 1731 return true; 1732 1733 dev = nhi->fib_nhc.nhc_dev; 1734 if (dev_idx && (!dev || dev->ifindex != dev_idx)) 1735 return true; 1736 1737 if (master_idx) { 1738 struct net_device *master; 1739 1740 if (!dev) 1741 return true; 1742 1743 master = netdev_master_upper_dev_get((struct net_device *)dev); 1744 if (!master || master->ifindex != master_idx) 1745 return true; 1746 } 1747 1748 return false; 1749 } 1750 1751 static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, 1752 int *master_idx, bool *group_filter, 1753 bool *fdb_filter, struct netlink_callback *cb) 1754 { 1755 struct netlink_ext_ack *extack = cb->extack; 1756 struct nlattr *tb[NHA_MAX + 1]; 1757 struct nhmsg *nhm; 1758 int err, i; 1759 u32 idx; 1760 1761 err = nlmsg_parse(nlh, sizeof(*nhm), tb, NHA_MAX, rtm_nh_policy, 1762 NULL); 1763 if (err < 0) 1764 return err; 1765 1766 for (i = 0; i <= NHA_MAX; ++i) { 1767 if (!tb[i]) 1768 continue; 1769 1770 switch (i) { 1771 case NHA_OIF: 1772 idx = nla_get_u32(tb[i]); 1773 if (idx > INT_MAX) { 1774 NL_SET_ERR_MSG(extack, "Invalid device index"); 1775 return -EINVAL; 1776 } 1777 *dev_idx = idx; 1778 break; 1779 case NHA_MASTER: 1780 idx = nla_get_u32(tb[i]); 1781 if (idx > INT_MAX) { 1782 NL_SET_ERR_MSG(extack, "Invalid master device index"); 1783 return -EINVAL; 1784 } 1785 *master_idx = idx; 1786 break; 1787 case NHA_GROUPS: 1788 *group_filter = true; 1789 break; 1790 case NHA_FDB: 1791 *fdb_filter = true; 1792 break; 1793 default: 1794 NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); 1795 return -EINVAL; 1796 } 1797 } 1798 1799 nhm = nlmsg_data(nlh); 1800 if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { 1801 NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); 1802 return -EINVAL; 1803 } 1804 1805 return 0; 1806 } 1807 1808 /* rtnl */ 1809 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) 1810 { 1811 bool group_filter = false, fdb_filter = false; 1812 struct nhmsg *nhm = nlmsg_data(cb->nlh); 1813 int dev_filter_idx = 0, master_idx = 0; 1814 struct net *net = sock_net(skb->sk); 1815 struct rb_root *root = &net->nexthop.rb_root; 1816 struct rb_node *node; 1817 int idx = 0, s_idx; 1818 int err; 1819 1820 err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, 1821 &group_filter, &fdb_filter, cb); 1822 if (err < 0) 1823 return err; 1824 1825 s_idx = cb->args[0]; 1826 for (node = rb_first(root); node; node = rb_next(node)) { 1827 struct nexthop *nh; 1828 1829 if (idx < s_idx) 1830 goto cont; 1831 1832 nh = rb_entry(node, struct nexthop, rb_node); 1833 if (nh_dump_filtered(nh, dev_filter_idx, master_idx, 1834 group_filter, nhm->nh_family)) 1835 goto cont; 1836 1837 err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, 1838 NETLINK_CB(cb->skb).portid, 1839 cb->nlh->nlmsg_seq, NLM_F_MULTI); 1840 if (err < 0) { 1841 if (likely(skb->len)) 1842 goto out; 1843 1844 goto out_err; 1845 } 1846 cont: 1847 idx++; 1848 } 1849 1850 out: 1851 err = skb->len; 1852 out_err: 1853 cb->args[0] = idx; 1854 cb->seq = net->nexthop.seq; 1855 nl_dump_check_consistent(cb, nlmsg_hdr(skb)); 1856 1857 return err; 1858 } 1859 1860 static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) 1861 { 1862 unsigned int hash = nh_dev_hashfn(dev->ifindex); 1863 struct net *net = dev_net(dev); 1864 struct hlist_head *head = &net->nexthop.devhash[hash]; 1865 struct hlist_node *n; 1866 struct nh_info *nhi; 1867 1868 hlist_for_each_entry_safe(nhi, n, head, dev_hash) { 1869 if (nhi->fib_nhc.nhc_dev == dev) { 1870 if (nhi->family == AF_INET) 1871 fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, 1872 orig_mtu); 1873 } 1874 } 1875 } 1876 1877 /* rtnl */ 1878 static int nh_netdev_event(struct notifier_block *this, 1879 unsigned long event, void *ptr) 1880 { 1881 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 1882 struct netdev_notifier_info_ext *info_ext; 1883 1884 switch (event) { 1885 case NETDEV_DOWN: 1886 case NETDEV_UNREGISTER: 1887 nexthop_flush_dev(dev); 1888 break; 1889 case NETDEV_CHANGE: 1890 if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) 1891 nexthop_flush_dev(dev); 1892 break; 1893 case NETDEV_CHANGEMTU: 1894 info_ext = ptr; 1895 nexthop_sync_mtu(dev, info_ext->ext.mtu); 1896 rt_cache_flush(dev_net(dev)); 1897 break; 1898 } 1899 return NOTIFY_DONE; 1900 } 1901 1902 static struct notifier_block nh_netdev_notifier = { 1903 .notifier_call = nh_netdev_event, 1904 }; 1905 1906 int register_nexthop_notifier(struct net *net, struct notifier_block *nb) 1907 { 1908 return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); 1909 } 1910 EXPORT_SYMBOL(register_nexthop_notifier); 1911 1912 int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) 1913 { 1914 return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, 1915 nb); 1916 } 1917 EXPORT_SYMBOL(unregister_nexthop_notifier); 1918 1919 static void __net_exit nexthop_net_exit(struct net *net) 1920 { 1921 rtnl_lock(); 1922 flush_all_nexthops(net); 1923 rtnl_unlock(); 1924 kfree(net->nexthop.devhash); 1925 } 1926 1927 static int __net_init nexthop_net_init(struct net *net) 1928 { 1929 size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; 1930 1931 net->nexthop.rb_root = RB_ROOT; 1932 net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); 1933 if (!net->nexthop.devhash) 1934 return -ENOMEM; 1935 ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); 1936 1937 return 0; 1938 } 1939 1940 static struct pernet_operations nexthop_net_ops = { 1941 .init = nexthop_net_init, 1942 .exit = nexthop_net_exit, 1943 }; 1944 1945 static int __init nexthop_init(void) 1946 { 1947 register_pernet_subsys(&nexthop_net_ops); 1948 1949 register_netdevice_notifier(&nh_netdev_notifier); 1950 1951 rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); 1952 rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); 1953 rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, 1954 rtm_dump_nexthop, 0); 1955 1956 rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); 1957 rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); 1958 1959 rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); 1960 rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); 1961 1962 return 0; 1963 } 1964 subsys_initcall(nexthop_init); 1965