// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);

static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;
	ft->src_port = ctt->src.u.tcp.port;
	ft->dst_port = ctt->dst.u.tcp.port;
}

struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct) ||
	    !atomic_inc_not_zero(&ct->ct_general.use)))
		return NULL;

	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
	if (!flow)
		goto err_ct_refcnt;

	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		__set_bit(NF_FLOW_SNAT, &flow->flags);
	if (ct->status & IPS_DST_NAT)
		__set_bit(NF_FLOW_DNAT, &flow->flags);

	return flow;

err_ct_refcnt:
	nf_ct_put(ct);

	return NULL;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
{
	const struct rt6_info *rt;

	if (flow_tuple->l3proto == NFPROTO_IPV6) {
		rt = (const struct rt6_info *)flow_tuple->dst_cache;
		return rt6_get_cookie(rt);
	}

	return 0;
}
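
/*
 * Fill in the transmit side of one tuple direction from the route
 * descriptor: path MTU, input interface, the encapsulation stack
 * (e.g. VLAN/PPPoE tags, copied in reverse order), and either the
 * outgoing hardware addresses (direct xmit) or a reference-counted
 * dst entry (neigh/xfrm xmit). Returns -1 if the dst reference cannot
 * be taken because the entry is already being released.
 */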
static int flow_offload_fill_route(struct flow_offload *flow,
				   const struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *dst = route->tuple[dir].dst;
	int i, j = 0;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_forward(dst);
		break;
	}

	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
		if (route->tuple[dir].in.ingress_vlans & BIT(i))
			flow_tuple->in_vlan_ingress |= BIT(j);
		j++;
	}
	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;

	switch (route->tuple[dir].xmit_type) {
	case FLOW_OFFLOAD_XMIT_DIRECT:
		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
		       ETH_ALEN);
		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
		       ETH_ALEN);
		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
		flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
		break;
	case FLOW_OFFLOAD_XMIT_XFRM:
	case FLOW_OFFLOAD_XMIT_NEIGH:
		if (!dst_hold_safe(route->tuple[dir].dst))
			return -1;

		flow_tuple->dst_cache = dst;
		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
	flow_tuple->xmit_type = route->tuple[dir].xmit_type;

	return 0;
}

static void nft_flow_dst_release(struct flow_offload *flow,
				 enum flow_offload_tuple_dir dir)
{
	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
		dst_release(flow->tuplehash[dir].tuple.dst_cache);
}

int flow_offload_route_init(struct flow_offload *flow,
			    const struct nf_flow_route *route)
{
	int err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	if (err < 0)
		return err;

	err = flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	if (err < 0)
		goto err_route_reply;

	flow->type = NF_FLOW_OFFLOAD_ROUTE;

	return 0;

err_route_reply:
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);

	return err;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
{
	tcp->state = TCP_CONNTRACK_ESTABLISHED;
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[1].td_maxwin = 0;
}

#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT	(120 * HZ)
#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT	(30 * HZ)

static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	int l4num = nf_ct_protonum(ct);
	unsigned int timeout;

	l4proto = nf_ct_l4proto_find(l4num);
	if (!l4proto)
		return;

	if (l4num == IPPROTO_TCP)
		timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
	else if (l4num == IPPROTO_UDP)
		timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
	else
		return;

	if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout)
		ct->timeout = nfct_time_stamp + timeout;
}

static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
	if (nf_ct_protonum(ct) == IPPROTO_TCP)
		flow_offload_fixup_tcp(&ct->proto.tcp);
}

static void flow_offload_fixup_ct(struct nf_conn *ct)
{
	flow_offload_fixup_ct_state(ct);
	flow_offload_fixup_ct_timeout(ct);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}

void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
				 const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};
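
/*
 * Tuples are hashed and compared over the prefix of struct
 * flow_offload_tuple up to the __hash marker, and each flow is
 * inserted into the rhashtable twice, once per direction, so lookups
 * from either side of the connection hit the same entry. A flowtable
 * backend such as the nft_flow_offload expression drives this API
 * roughly as follows (a sketch only; the route lookup, teardown on
 * failure and IPS_OFFLOAD bit handling are omitted):
 *
 *	flow = flow_offload_alloc(ct);
 *	if (!flow)
 *		goto err;
 *	if (flow_offload_route_init(flow, &route) < 0)
 *		goto err_flow;
 *	if (flow_offload_add(flowtable, flow) < 0)
 *		goto err_flow;
 */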
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	nf_ct_offload_timeout(flow->ct);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

void flow_offload_refresh(struct nf_flowtable *flow_table,
			  struct flow_offload *flow)
{
	flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;

	if (likely(!nf_flowtable_hw_offload(flow_table) ||
		   !test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags)))
		return;

	nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);

	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);

	if (nf_flow_has_expired(flow))
		flow_offload_fixup_ct(flow->ct);
	else
		flow_offload_fixup_ct_timeout(flow->ct);

	flow_offload_free(flow);
}

void flow_offload_teardown(struct flow_offload *flow)
{
	set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	flow_offload_fixup_ct_state(flow->ct);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
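
/*
 * Walk all flows in the table and invoke @iter on each. Reply-direction
 * tuplehash entries are skipped so every flow is visited exactly once;
 * -EAGAIN from a concurrent rhashtable resize is tolerated by simply
 * continuing the walk.
 */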
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple)
{
	struct dst_entry *dst;

	if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
	    tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
		dst = tuple->dst_cache;
		if (!dst_check(dst, tuple->dst_cookie))
			return true;
	}

	return false;
}

static bool nf_flow_has_stale_dst(struct flow_offload *flow)
{
	return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) ||
	       flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple);
}

static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
	struct nf_flowtable *flow_table = data;

	if (nf_flow_has_expired(flow) ||
	    nf_ct_is_dying(flow->ct) ||
	    nf_flow_has_stale_dst(flow))
		set_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}

static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct udphdr *udph;

	udph = (void *)(skb_network_header(skb) + thoff);
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, false);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}
}

static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			     u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
		break;
	case IPPROTO_UDP:
		nf_flow_nat_port_udp(skb, thoff, port, new_port);
		break;
	}
}
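
/*
 * Mangle the L4 ports of a packet on an offloaded flow, taking the
 * replacement port from the opposite direction's tuple: SNAT rewrites
 * the original direction's source port to the reply tuple's destination
 * port (and vice versa for replies), while DNAT does the mirror image.
 * The L4 checksum is then fixed up via nf_flow_nat_port().
 */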
void nf_flow_snat_port(const struct flow_offload *flow,
		       struct sk_buff *skb, unsigned int thoff,
		       u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
		       unsigned int thoff, u8 protocol,
		       enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);
	init_rwsem(&flowtable->flow_block_lock);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_teardown(flow);
}

void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_gc_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
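
/*
 * Final teardown of a flowtable: unlink it, stop the periodic gc, mark
 * every remaining flow for teardown and run the gc step to release
 * them. With hardware offload, the offload workqueue is flushed and a
 * second gc pass reaps flows whose hardware state only finished dying
 * during the flush, before the rhashtable is destroyed.
 */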
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
	nf_flow_table_offload_flush(flow_table);
	if (nf_flowtable_hw_offload(flow_table))
		nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
				      flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int __init nf_flow_table_module_init(void)
{
	return nf_flow_table_offload_init();
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("Netfilter flow table module");