// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

/* Per-set private data for the rbtree backend. */
struct nft_rbtree {
	struct rb_root		root;		/* element tree, ordered by key (memcmp) */
	rwlock_t		lock;		/* serializes writers; slow-path readers take it too */
	seqcount_rwlock_t	count;		/* lets lookups run locklessly and detect concurrent writers */
	struct delayed_work	gc_work;	/* periodic collection of expired elements */
};

/* One set element: rbtree linkage plus the generic element extensions. */
struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};

/* True if this element marks the (exclusive) end of an interval. */
static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

/* True if this element marks the start of an interval (i.e. not an end). */
static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
	return !nft_rbtree_interval_end(rbe);
}

/* Compare a raw key against an element's key, over the set's key length. */
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
			     const struct nft_rbtree_elem *interval)
{
	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

/*
 * Lockless tree descent looking for an exact match or, for interval sets,
 * the closest preceding active start element ("interval").  Runs under the
 * seqcount read-side: returns false as soon as a concurrent writer is
 * detected so that the caller can retry (possibly under priv->lock).
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Writer raced with us: give up, caller retries. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Don't let an end element with the same key as the
			 * candidate start element replace that candidate.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match: skip elements not active in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				/* Exclusive end: no match for anonymous sets;
				 * otherwise keep looking left for a start
				 * element with the same key.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets, an active, unexpired start
	 * element to the left of the key means the key falls in its range.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}

/*
 * Datapath lookup.  Fast path is fully lockless under the seqcount; if a
 * writer interfered, retry once under the read lock, which excludes
 * writers for the duration of the second walk.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}

/*
 * Control-plane element retrieval.  Like __nft_rbtree_lookup() but returns
 * the element itself and honours NFT_SET_ELEM_INTERVAL_END in @flags so the
 * caller can ask specifically for a start or an end element.  Also runs
 * under the seqcount; returns false on writer interference.
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Track candidate start element to the left. */
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			/* Track candidate end element to the right. */
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			/* Exact match if the element's end/start kind agrees
			 * with what the caller asked for (elements without a
			 * FLAGS extension count as start elements).
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	/* No exact match: fall back to the tracked interval candidate of
	 * the requested kind, if it is active and unexpired.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}

/*
 * .get callback: returns the matching element or ERR_PTR(-ENOENT).
 * Same fast-path/slow-path structure as nft_rbtree_lookup().
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}

/*
 * Insert @new into the tree, rejecting overlapping intervals with
 * -ENOTEMPTY and exact duplicates with -EEXIST (in which case *ext is set
 * to the existing element so the caller can decide based on NLM_F_EXCL).
 * Must be called with priv->lock write-held and the seqcount write section
 * open (see nft_rbtree_insert()).
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	bool overlap = false, dup_end_left = false, dup_end_right = false;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	/* Detect overlaps as we descend the tree. Set the flag in these cases:
	 *
	 * a1. _ _ __>|  ?_ _ __|  (insert end before existing end)
	 * a2. _ _ ___|  ?_ _ _>|  (insert end after existing end)
	 * a3. _ _ ___? >|_ _ __|  (insert start before existing end)
	 *
	 * and clear it later on, as we eventually reach the points indicated by
	 * '?' above, in the cases described below. We'll always meet these
	 * later, locally, due to tree ordering, and overlaps for the intervals
	 * that are the closest together are always evaluated last.
	 *
	 * b1. _ _ __>|  !_ _ __|  (insert end before existing start)
	 * b2. _ _ ___|  !_ _ _>|  (insert end after existing start)
	 * b3. _ _ ___! >|_ _ __|  (insert start after existing end, as a leaf)
	 *            '--' no nodes falling in this range
	 * b4.          >|_ _   !  (insert start before existing start)
	 *
	 * Case a3. resolves to b3.:
	 * - if the inserted start element is the leftmost, because the '0'
	 *   element in the tree serves as end element
	 * - otherwise, if an existing end is found immediately to the left. If
	 *   there are existing nodes in between, we need to further descend the
	 *   tree before we can conclude the new start isn't causing an overlap
	 *
	 * or to b4., which, preceded by a3., means we already traversed one or
	 * more existing intervals entirely, from the right.
	 *
	 * For a new, rightmost pair of elements, we'll hit cases b3. and b2.,
	 * in that order.
	 *
	 * The flag is also cleared in two special cases:
	 *
	 * b5. |__ _ _!|<_ _ _   (insert start right before existing end)
	 * b6. |__ _ >|!__ _ _   (insert end right after existing start)
	 *
	 * which always happen as last step and imply that no further
	 * overlapping is possible.
	 *
	 * Another special case comes from the fact that start elements matching
	 * an already existing start element are allowed: insertion is not
	 * performed but we return -EEXIST in that case, and the error will be
	 * cleared by the caller if NLM_F_EXCL is not present in the request.
	 * This way, request for insertion of an exact overlap isn't reported as
	 * error to userspace if not desired.
	 *
	 * However, if the existing start matches a pre-existing start, but the
	 * end element doesn't match the corresponding pre-existing end element,
	 * we need to report a partial overlap. This is a local condition that
	 * can be noticed without need for a tracking flag, by checking for a
	 * local duplicated end for a corresponding start, from left and right,
	 * separately.
	 */

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0) {
			p = &parent->rb_left;

			if (nft_rbtree_interval_start(new)) {
				/* Case a3/b3: an end immediately to the left
				 * of the new start clears a pending overlap.
				 */
				if (nft_rbtree_interval_end(rbe) &&
				    nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext) && !*p)
					overlap = false;
			} else {
				/* Duplicated end seen while descending right
				 * then left: partial overlap.
				 */
				if (dup_end_left && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				/* Note: continue skips the flag reset below,
				 * keeping dup_end_right armed for this branch.
				 */
				if (overlap) {
					dup_end_right = true;
					continue;
				}
			}
		} else if (d > 0) {
			p = &parent->rb_right;

			if (nft_rbtree_interval_end(new)) {
				if (dup_end_right && !*p)
					return -ENOTEMPTY;

				overlap = nft_rbtree_interval_end(rbe) &&
					  nft_set_elem_active(&rbe->ext,
							      genmask) &&
					  !nft_set_elem_expired(&rbe->ext);

				if (overlap) {
					dup_end_left = true;
					continue;
				}
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				/* Case a3: new start before an existing end. */
				overlap = nft_rbtree_interval_end(rbe);
			}
		} else {
			/* Same key: order end-before-start; matching kinds
			 * mean an exact duplicate (-EEXIST) if active.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;	/* case b5 */
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;

				if (nft_set_elem_active(&rbe->ext, genmask) &&
				    !nft_set_elem_expired(&rbe->ext))
					overlap = false;	/* case b6 */
			} else if (nft_set_elem_active(&rbe->ext, genmask) &&
				   !nft_set_elem_expired(&rbe->ext)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				p = &parent->rb_left;
			}
		}

		dup_end_left = dup_end_right = false;
	}

	if (overlap)
		return -ENOTEMPTY;

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}

/* .insert callback: wrap __nft_rbtree_insert() in the writer-side locking. */
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}

/* .remove callback: unlink the element under writer-side locking. */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}

/* .activate callback: flip the element into the next generation and
 * clear the GC busy mark.
 */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}

/* .flush callback: deactivate the element for the next generation unless
 * async GC already marked it busy.  Returns true if deactivated.
 */
static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}

/* .deactivate callback: find the element matching @elem's key and
 * start/end kind, deactivate it and return it; NULL if not found.
 * NOTE(review): walks the tree without priv->lock — presumably callers
 * serialize via the nf_tables transaction mutex; confirm against callers.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key but different kind: keep descending in the
			 * direction where the matching kind would sort.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}

/* .walk callback: in-order iteration over active, unexpired elements,
 * honouring iter->skip, under the read lock.
 */
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;
		if (nft_set_elem_expired(&rbe->ext))
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}

/*
 * Async garbage collection work: walk the tree under writer-side locking,
 * batching expired, non-busy start elements (and the end element tracked
 * in rbe_end) for destruction.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erase of a queued victim is deferred via rbe_prev so the
		 * iterator has already moved past it when it goes away.
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* NOTE(review): this extra advance, combined with the
		 * for-loop's rb_next(), skips the node following the one
		 * just queued — confirm against upstream history that this
		 * cannot step over an unexpired element.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	/* Re-arm ourselves for the next GC interval. */
	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}

/* .privsize callback: size of the per-set private area. */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}

/* .init callback: set up locks, the empty tree, and (for sets with
 * timeouts) kick off the periodic GC work.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}

/* .destroy callback: stop GC, wait for in-flight RCU readers, then free
 * every remaining element.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}

/* .estimate callback: this backend only supports single-field keys;
 * O(log n) lookup, O(n) space.
 */
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	if (desc->field_count > 1)
		return false;

	if (desc->size)
		est->size = sizeof(struct nft_rbtree) +
			    desc->size * sizeof(struct nft_rbtree_elem);
	else
		est->size = ~0;

	est->lookup = NFT_SET_CLASS_O_LOG_N;
	est->space  = NFT_SET_CLASS_O_N;

	return true;
}

/* Backend registration: the rbtree set type supports intervals, maps,
 * object maps and timeouts.
 */
const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};