// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

struct nft_rbtree {
	struct rb_root		root;
	rwlock_t		lock;
	seqcount_t		count;
	struct delayed_work	gc_work;
};

struct nft_rbtree_elem {
	struct rb_node		node;
	struct nft_set_ext	ext;
};

static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
			     const struct nft_rbtree_elem *interval)
{
	return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}

static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}
			if (nft_rbtree_interval_end(rbe))
				goto out;

			*ext = &rbe->ext;
			return true;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_interval_end(interval)) {
		*ext = &interval->ext;
		return true;
	}
out:
	return false;
}

static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
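/*
 * Read-side pattern (informal note): nft_rbtree_lookup() above first walks
 * the tree locklessly under RCU, using the seqcount to detect a concurrent
 * writer; only when that race is observed does it retry under the read
 * lock, so the fast path never blocks on control-plane updates. Intervals
 * are stored as element pairs: a start node plus a node flagged
 * NFT_SET_ELEM_INTERVAL_END whose key is the first value past the range
 * (roughly, keys 10.0.0.0 and 10.0.1.0 for 10.0.0.0-10.0.0.255). An exact
 * match on an end node is reported as a miss (the "goto out" above), so
 * ranges behave as half-open intervals.
 */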
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			/* Skip inactive elements: keep descending instead of
			 * matching on them.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}
			return false;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}

static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}

static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(new)) {
				p = &parent->rb_left;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;
			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				p = &parent->rb_left;
			}
		}
	}
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}

static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}
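/*
 * Write-side pattern (informal note): insert and remove take the write
 * lock and wrap the tree modification in a seqcount write section, which
 * forces any in-flight lockless lookup to retry. Whether an element is
 * actually visible to packets is decided separately by the generation
 * mask (see activate/deactivate below), so an aborted transaction never
 * exposes half-committed elements to the data path.
 */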
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}

static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}

static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}

static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    !nft_rbtree_interval_end(this)) {
				parent = parent->rb_left;
				continue;
			} else if (!nft_rbtree_interval_end(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}

static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}
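/*
 * GC note: expired elements are queued in batches under the write lock.
 * Erasing the node the iterator currently stands on would let rb_erase()
 * rebalance the tree underneath rb_next(), so the element just queued is
 * remembered in rbe_prev and erased only once iteration has moved past
 * it; the interval-end node seen before it is queued and erased alongside.
 */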
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}

static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_init(&priv->count);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}

static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}

static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	if (desc->size)
		est->size = sizeof(struct nft_rbtree) +
			    desc->size * sizeof(struct nft_rbtree_elem);
	else
		est->size = ~0;

	est->lookup = NFT_SET_CLASS_O_LOG_N;
	est->space  = NFT_SET_CLASS_O_N;

	return true;
}

struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};