1 // SPDX-License-Identifier: GPL-2.0-only 2 #include <linux/kernel.h> 3 #include <linux/init.h> 4 #include <linux/module.h> 5 #include <linux/netfilter.h> 6 #include <linux/rhashtable.h> 7 #include <linux/netdevice.h> 8 #include <net/ip.h> 9 #include <net/ip6_route.h> 10 #include <net/netfilter/nf_tables.h> 11 #include <net/netfilter/nf_flow_table.h> 12 #include <net/netfilter/nf_conntrack.h> 13 #include <net/netfilter/nf_conntrack_core.h> 14 #include <net/netfilter/nf_conntrack_l4proto.h> 15 #include <net/netfilter/nf_conntrack_tuple.h> 16 17 static DEFINE_MUTEX(flowtable_lock); 18 static LIST_HEAD(flowtables); 19 20 static void 21 flow_offload_fill_dir(struct flow_offload *flow, 22 enum flow_offload_tuple_dir dir) 23 { 24 struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple; 25 struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple; 26 27 ft->dir = dir; 28 29 switch (ctt->src.l3num) { 30 case NFPROTO_IPV4: 31 ft->src_v4 = ctt->src.u3.in; 32 ft->dst_v4 = ctt->dst.u3.in; 33 break; 34 case NFPROTO_IPV6: 35 ft->src_v6 = ctt->src.u3.in6; 36 ft->dst_v6 = ctt->dst.u3.in6; 37 break; 38 } 39 40 ft->l3proto = ctt->src.l3num; 41 ft->l4proto = ctt->dst.protonum; 42 ft->src_port = ctt->src.u.tcp.port; 43 ft->dst_port = ctt->dst.u.tcp.port; 44 } 45 46 struct flow_offload *flow_offload_alloc(struct nf_conn *ct) 47 { 48 struct flow_offload *flow; 49 50 if (unlikely(nf_ct_is_dying(ct) || 51 !atomic_inc_not_zero(&ct->ct_general.use))) 52 return NULL; 53 54 flow = kzalloc(sizeof(*flow), GFP_ATOMIC); 55 if (!flow) 56 goto err_ct_refcnt; 57 58 flow->ct = ct; 59 60 flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL); 61 flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY); 62 63 if (ct->status & IPS_SRC_NAT) 64 __set_bit(NF_FLOW_SNAT, &flow->flags); 65 if (ct->status & IPS_DST_NAT) 66 __set_bit(NF_FLOW_DNAT, &flow->flags); 67 68 return flow; 69 70 err_ct_refcnt: 71 nf_ct_put(ct); 72 73 return NULL; 74 } 75 EXPORT_SYMBOL_GPL(flow_offload_alloc); 76 77 static int flow_offload_fill_route(struct flow_offload *flow, 78 const struct nf_flow_route *route, 79 enum flow_offload_tuple_dir dir) 80 { 81 struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple; 82 struct dst_entry *other_dst = route->tuple[!dir].dst; 83 struct dst_entry *dst = route->tuple[dir].dst; 84 85 if (!dst_hold_safe(route->tuple[dir].dst)) 86 return -1; 87 88 switch (flow_tuple->l3proto) { 89 case NFPROTO_IPV4: 90 flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); 91 break; 92 case NFPROTO_IPV6: 93 flow_tuple->mtu = ip6_dst_mtu_forward(dst); 94 break; 95 } 96 97 flow_tuple->iifidx = other_dst->dev->ifindex; 98 flow_tuple->dst_cache = dst; 99 100 return 0; 101 } 102 103 int flow_offload_route_init(struct flow_offload *flow, 104 const struct nf_flow_route *route) 105 { 106 int err; 107 108 err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL); 109 if (err < 0) 110 return err; 111 112 err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY); 113 if (err < 0) 114 goto err_route_reply; 115 116 flow->type = NF_FLOW_OFFLOAD_ROUTE; 117 118 return 0; 119 120 err_route_reply: 121 dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst); 122 123 return err; 124 } 125 EXPORT_SYMBOL_GPL(flow_offload_route_init); 126 127 static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) 128 { 129 tcp->state = TCP_CONNTRACK_ESTABLISHED; 130 tcp->seen[0].td_maxwin = 0; 131 tcp->seen[1].td_maxwin = 0; 132 } 133 134 #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) 135 #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) 136 137 static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) 138 { 139 const struct nf_conntrack_l4proto *l4proto; 140 int l4num = nf_ct_protonum(ct); 141 unsigned int timeout; 142 143 l4proto = nf_ct_l4proto_find(l4num); 144 if (!l4proto) 145 return; 146 147 if (l4num == IPPROTO_TCP) 148 timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT; 149 else if (l4num == IPPROTO_UDP) 150 timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT; 151 else 152 return; 153 154 if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) 155 ct->timeout = nfct_time_stamp + timeout; 156 } 157 158 static void flow_offload_fixup_ct_state(struct nf_conn *ct) 159 { 160 if (nf_ct_protonum(ct) == IPPROTO_TCP) 161 flow_offload_fixup_tcp(&ct->proto.tcp); 162 } 163 164 static void flow_offload_fixup_ct(struct nf_conn *ct) 165 { 166 flow_offload_fixup_ct_state(ct); 167 flow_offload_fixup_ct_timeout(ct); 168 } 169 170 static void flow_offload_route_release(struct flow_offload *flow) 171 { 172 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache); 173 dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache); 174 } 175 176 void flow_offload_free(struct flow_offload *flow) 177 { 178 switch (flow->type) { 179 case NF_FLOW_OFFLOAD_ROUTE: 180 flow_offload_route_release(flow); 181 break; 182 default: 183 break; 184 } 185 nf_ct_put(flow->ct); 186 kfree_rcu(flow, rcu_head); 187 } 188 EXPORT_SYMBOL_GPL(flow_offload_free); 189 190 static u32 flow_offload_hash(const void *data, u32 len, u32 seed) 191 { 192 const struct flow_offload_tuple *tuple = data; 193 194 return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed); 195 } 196 197 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed) 198 { 199 const struct flow_offload_tuple_rhash *tuplehash = data; 200 201 return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed); 202 } 203 204 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg, 205 const void *ptr) 206 { 207 const struct flow_offload_tuple *tuple = arg->key; 208 const struct flow_offload_tuple_rhash *x = ptr; 209 210 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash))) 211 return 1; 212 213 return 0; 214 } 215 216 static const struct rhashtable_params nf_flow_offload_rhash_params = { 217 .head_offset = offsetof(struct flow_offload_tuple_rhash, node), 218 .hashfn = flow_offload_hash, 219 .obj_hashfn = flow_offload_hash_obj, 220 .obj_cmpfn = flow_offload_hash_cmp, 221 .automatic_shrinking = true, 222 }; 223 224 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) 225 { 226 int err; 227 228 flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; 229 230 err = rhashtable_insert_fast(&flow_table->rhashtable, 231 &flow->tuplehash[0].node, 232 nf_flow_offload_rhash_params); 233 if (err < 0) 234 return err; 235 236 err = rhashtable_insert_fast(&flow_table->rhashtable, 237 &flow->tuplehash[1].node, 238 nf_flow_offload_rhash_params); 239 if (err < 0) { 240 rhashtable_remove_fast(&flow_table->rhashtable, 241 &flow->tuplehash[0].node, 242 nf_flow_offload_rhash_params); 243 return err; 244 } 245 246 nf_ct_offload_timeout(flow->ct); 247 248 if (nf_flowtable_hw_offload(flow_table)) { 249 __set_bit(NF_FLOW_HW, &flow->flags); 250 nf_flow_offload_add(flow_table, flow); 251 } 252 253 return 0; 254 } 255 EXPORT_SYMBOL_GPL(flow_offload_add); 256 257 void flow_offload_refresh(struct nf_flowtable *flow_table, 258 struct flow_offload *flow) 259 { 260 flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT; 261 262 if (likely(!nf_flowtable_hw_offload(flow_table) || 263 !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags))) 264 return; 265 266 nf_flow_offload_add(flow_table, flow); 267 } 268 EXPORT_SYMBOL_GPL(flow_offload_refresh); 269 270 static inline bool nf_flow_has_expired(const struct flow_offload *flow) 271 { 272 return nf_flow_timeout_delta(flow->timeout) <= 0; 273 } 274 275 static void flow_offload_del(struct nf_flowtable *flow_table, 276 struct flow_offload *flow) 277 { 278 rhashtable_remove_fast(&flow_table->rhashtable, 279 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node, 280 nf_flow_offload_rhash_params); 281 rhashtable_remove_fast(&flow_table->rhashtable, 282 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node, 283 nf_flow_offload_rhash_params); 284 285 clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); 286 287 if (nf_flow_has_expired(flow)) 288 flow_offload_fixup_ct(flow->ct); 289 else 290 flow_offload_fixup_ct_timeout(flow->ct); 291 292 flow_offload_free(flow); 293 } 294 295 void flow_offload_teardown(struct flow_offload *flow) 296 { 297 set_bit(NF_FLOW_TEARDOWN, &flow->flags); 298 299 flow_offload_fixup_ct_state(flow->ct); 300 } 301 EXPORT_SYMBOL_GPL(flow_offload_teardown); 302 303 struct flow_offload_tuple_rhash * 304 flow_offload_lookup(struct nf_flowtable *flow_table, 305 struct flow_offload_tuple *tuple) 306 { 307 struct flow_offload_tuple_rhash *tuplehash; 308 struct flow_offload *flow; 309 int dir; 310 311 tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple, 312 nf_flow_offload_rhash_params); 313 if (!tuplehash) 314 return NULL; 315 316 dir = tuplehash->tuple.dir; 317 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 318 if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) 319 return NULL; 320 321 if (unlikely(nf_ct_is_dying(flow->ct))) 322 return NULL; 323 324 return tuplehash; 325 } 326 EXPORT_SYMBOL_GPL(flow_offload_lookup); 327 328 static int 329 nf_flow_table_iterate(struct nf_flowtable *flow_table, 330 void (*iter)(struct flow_offload *flow, void *data), 331 void *data) 332 { 333 struct flow_offload_tuple_rhash *tuplehash; 334 struct rhashtable_iter hti; 335 struct flow_offload *flow; 336 int err = 0; 337 338 rhashtable_walk_enter(&flow_table->rhashtable, &hti); 339 rhashtable_walk_start(&hti); 340 341 while ((tuplehash = rhashtable_walk_next(&hti))) { 342 if (IS_ERR(tuplehash)) { 343 if (PTR_ERR(tuplehash) != -EAGAIN) { 344 err = PTR_ERR(tuplehash); 345 break; 346 } 347 continue; 348 } 349 if (tuplehash->tuple.dir) 350 continue; 351 352 flow = container_of(tuplehash, struct flow_offload, tuplehash[0]); 353 354 iter(flow, data); 355 } 356 rhashtable_walk_stop(&hti); 357 rhashtable_walk_exit(&hti); 358 359 return err; 360 } 361 362 static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) 363 { 364 struct nf_flowtable *flow_table = data; 365 366 if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) 367 set_bit(NF_FLOW_TEARDOWN, &flow->flags); 368 369 if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { 370 if (test_bit(NF_FLOW_HW, &flow->flags)) { 371 if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) 372 nf_flow_offload_del(flow_table, flow); 373 else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags)) 374 flow_offload_del(flow_table, flow); 375 } else { 376 flow_offload_del(flow_table, flow); 377 } 378 } else if (test_bit(NF_FLOW_HW, &flow->flags)) { 379 nf_flow_offload_stats(flow_table, flow); 380 } 381 } 382 383 static void nf_flow_offload_work_gc(struct work_struct *work) 384 { 385 struct nf_flowtable *flow_table; 386 387 flow_table = container_of(work, struct nf_flowtable, gc_work.work); 388 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); 389 queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ); 390 } 391 392 393 static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff, 394 __be16 port, __be16 new_port) 395 { 396 struct tcphdr *tcph; 397 398 if (skb_try_make_writable(skb, thoff + sizeof(*tcph))) 399 return -1; 400 401 tcph = (void *)(skb_network_header(skb) + thoff); 402 inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false); 403 404 return 0; 405 } 406 407 static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff, 408 __be16 port, __be16 new_port) 409 { 410 struct udphdr *udph; 411 412 if (skb_try_make_writable(skb, thoff + sizeof(*udph))) 413 return -1; 414 415 udph = (void *)(skb_network_header(skb) + thoff); 416 if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { 417 inet_proto_csum_replace2(&udph->check, skb, port, 418 new_port, false); 419 if (!udph->check) 420 udph->check = CSUM_MANGLED_0; 421 } 422 423 return 0; 424 } 425 426 static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff, 427 u8 protocol, __be16 port, __be16 new_port) 428 { 429 switch (protocol) { 430 case IPPROTO_TCP: 431 if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0) 432 return NF_DROP; 433 break; 434 case IPPROTO_UDP: 435 if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0) 436 return NF_DROP; 437 break; 438 } 439 440 return 0; 441 } 442 443 int nf_flow_snat_port(const struct flow_offload *flow, 444 struct sk_buff *skb, unsigned int thoff, 445 u8 protocol, enum flow_offload_tuple_dir dir) 446 { 447 struct flow_ports *hdr; 448 __be16 port, new_port; 449 450 if (skb_try_make_writable(skb, thoff + sizeof(*hdr))) 451 return -1; 452 453 hdr = (void *)(skb_network_header(skb) + thoff); 454 455 switch (dir) { 456 case FLOW_OFFLOAD_DIR_ORIGINAL: 457 port = hdr->source; 458 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port; 459 hdr->source = new_port; 460 break; 461 case FLOW_OFFLOAD_DIR_REPLY: 462 port = hdr->dest; 463 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port; 464 hdr->dest = new_port; 465 break; 466 default: 467 return -1; 468 } 469 470 return nf_flow_nat_port(skb, thoff, protocol, port, new_port); 471 } 472 EXPORT_SYMBOL_GPL(nf_flow_snat_port); 473 474 int nf_flow_dnat_port(const struct flow_offload *flow, 475 struct sk_buff *skb, unsigned int thoff, 476 u8 protocol, enum flow_offload_tuple_dir dir) 477 { 478 struct flow_ports *hdr; 479 __be16 port, new_port; 480 481 if (skb_try_make_writable(skb, thoff + sizeof(*hdr))) 482 return -1; 483 484 hdr = (void *)(skb_network_header(skb) + thoff); 485 486 switch (dir) { 487 case FLOW_OFFLOAD_DIR_ORIGINAL: 488 port = hdr->dest; 489 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port; 490 hdr->dest = new_port; 491 break; 492 case FLOW_OFFLOAD_DIR_REPLY: 493 port = hdr->source; 494 new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port; 495 hdr->source = new_port; 496 break; 497 default: 498 return -1; 499 } 500 501 return nf_flow_nat_port(skb, thoff, protocol, port, new_port); 502 } 503 EXPORT_SYMBOL_GPL(nf_flow_dnat_port); 504 505 int nf_flow_table_init(struct nf_flowtable *flowtable) 506 { 507 int err; 508 509 INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc); 510 flow_block_init(&flowtable->flow_block); 511 init_rwsem(&flowtable->flow_block_lock); 512 513 err = rhashtable_init(&flowtable->rhashtable, 514 &nf_flow_offload_rhash_params); 515 if (err < 0) 516 return err; 517 518 queue_delayed_work(system_power_efficient_wq, 519 &flowtable->gc_work, HZ); 520 521 mutex_lock(&flowtable_lock); 522 list_add(&flowtable->list, &flowtables); 523 mutex_unlock(&flowtable_lock); 524 525 return 0; 526 } 527 EXPORT_SYMBOL_GPL(nf_flow_table_init); 528 529 static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) 530 { 531 struct net_device *dev = data; 532 533 if (!dev) { 534 flow_offload_teardown(flow); 535 return; 536 } 537 538 if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) && 539 (flow->tuplehash[0].tuple.iifidx == dev->ifindex || 540 flow->tuplehash[1].tuple.iifidx == dev->ifindex)) 541 flow_offload_teardown(flow); 542 } 543 544 void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, 545 struct net_device *dev) 546 { 547 nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); 548 flush_delayed_work(&flowtable->gc_work); 549 nf_flow_table_offload_flush(flowtable); 550 } 551 552 void nf_flow_table_cleanup(struct net_device *dev) 553 { 554 struct nf_flowtable *flowtable; 555 556 mutex_lock(&flowtable_lock); 557 list_for_each_entry(flowtable, &flowtables, list) 558 nf_flow_table_gc_cleanup(flowtable, dev); 559 mutex_unlock(&flowtable_lock); 560 } 561 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); 562 563 void nf_flow_table_free(struct nf_flowtable *flow_table) 564 { 565 mutex_lock(&flowtable_lock); 566 list_del(&flow_table->list); 567 mutex_unlock(&flowtable_lock); 568 569 cancel_delayed_work_sync(&flow_table->gc_work); 570 nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL); 571 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table); 572 nf_flow_table_offload_flush(flow_table); 573 if (nf_flowtable_hw_offload(flow_table)) 574 nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, 575 flow_table); 576 rhashtable_destroy(&flow_table->rhashtable); 577 } 578 EXPORT_SYMBOL_GPL(nf_flow_table_free); 579 580 static int __init nf_flow_table_module_init(void) 581 { 582 return nf_flow_table_offload_init(); 583 } 584 585 static void __exit nf_flow_table_module_exit(void) 586 { 587 nf_flow_table_offload_exit(); 588 } 589 590 module_init(nf_flow_table_module_init); 591 module_exit(nf_flow_table_module_exit); 592 593 MODULE_LICENSE("GPL"); 594 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); 595 MODULE_DESCRIPTION("Netfilter flow table module"); 596