1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 /* - 3 * net/sched/act_ct.c Connection Tracking action 4 * 5 * Authors: Paul Blakey <paulb@mellanox.com> 6 * Yossi Kuperman <yossiku@mellanox.com> 7 * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> 8 */ 9 10 #include <linux/module.h> 11 #include <linux/init.h> 12 #include <linux/kernel.h> 13 #include <linux/skbuff.h> 14 #include <linux/rtnetlink.h> 15 #include <linux/pkt_cls.h> 16 #include <linux/ip.h> 17 #include <linux/ipv6.h> 18 #include <linux/rhashtable.h> 19 #include <net/netlink.h> 20 #include <net/pkt_sched.h> 21 #include <net/pkt_cls.h> 22 #include <net/act_api.h> 23 #include <net/ip.h> 24 #include <net/ipv6_frag.h> 25 #include <uapi/linux/tc_act/tc_ct.h> 26 #include <net/tc_act/tc_ct.h> 27 #include <net/tc_wrapper.h> 28 29 #include <net/netfilter/nf_flow_table.h> 30 #include <net/netfilter/nf_conntrack.h> 31 #include <net/netfilter/nf_conntrack_core.h> 32 #include <net/netfilter/nf_conntrack_zones.h> 33 #include <net/netfilter/nf_conntrack_helper.h> 34 #include <net/netfilter/nf_conntrack_acct.h> 35 #include <net/netfilter/ipv6/nf_defrag_ipv6.h> 36 #include <net/netfilter/nf_conntrack_act_ct.h> 37 #include <net/netfilter/nf_conntrack_seqadj.h> 38 #include <uapi/linux/netfilter/nf_nat.h> 39 40 static struct workqueue_struct *act_ct_wq; 41 static struct rhashtable zones_ht; 42 static DEFINE_MUTEX(zones_mutex); 43 44 struct zones_ht_key { 45 struct net *net; 46 u16 zone; 47 /* Note : pad[] must be the last field. */ 48 u8 pad[]; 49 }; 50 51 struct tcf_ct_flow_table { 52 struct rhash_head node; /* In zones tables */ 53 54 struct rcu_work rwork; 55 struct nf_flowtable nf_ft; 56 refcount_t ref; 57 struct zones_ht_key key; 58 59 bool dying; 60 }; 61 62 static const struct rhashtable_params zones_params = { 63 .head_offset = offsetof(struct tcf_ct_flow_table, node), 64 .key_offset = offsetof(struct tcf_ct_flow_table, key), 65 .key_len = offsetof(struct zones_ht_key, pad), 66 .automatic_shrinking = true, 67 }; 68 69 static struct flow_action_entry * 70 tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action) 71 { 72 int i = flow_action->num_entries++; 73 74 return &flow_action->entries[i]; 75 } 76 77 static void tcf_ct_add_mangle_action(struct flow_action *action, 78 enum flow_action_mangle_base htype, 79 u32 offset, 80 u32 mask, 81 u32 val) 82 { 83 struct flow_action_entry *entry; 84 85 entry = tcf_ct_flow_table_flow_action_get_next(action); 86 entry->id = FLOW_ACTION_MANGLE; 87 entry->mangle.htype = htype; 88 entry->mangle.mask = ~mask; 89 entry->mangle.offset = offset; 90 entry->mangle.val = val; 91 } 92 93 /* The following nat helper functions check if the inverted reverse tuple 94 * (target) is different then the current dir tuple - meaning nat for ports 95 * and/or ip is needed, and add the relevant mangle actions. 96 */ 97 static void 98 tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple, 99 struct nf_conntrack_tuple target, 100 struct flow_action *action) 101 { 102 if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3))) 103 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4, 104 offsetof(struct iphdr, saddr), 105 0xFFFFFFFF, 106 be32_to_cpu(target.src.u3.ip)); 107 if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3))) 108 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4, 109 offsetof(struct iphdr, daddr), 110 0xFFFFFFFF, 111 be32_to_cpu(target.dst.u3.ip)); 112 } 113 114 static void 115 tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action, 116 union nf_inet_addr *addr, 117 u32 offset) 118 { 119 int i; 120 121 for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++) 122 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6, 123 i * sizeof(u32) + offset, 124 0xFFFFFFFF, be32_to_cpu(addr->ip6[i])); 125 } 126 127 static void 128 tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple, 129 struct nf_conntrack_tuple target, 130 struct flow_action *action) 131 { 132 if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3))) 133 tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3, 134 offsetof(struct ipv6hdr, 135 saddr)); 136 if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3))) 137 tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3, 138 offsetof(struct ipv6hdr, 139 daddr)); 140 } 141 142 static void 143 tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple, 144 struct nf_conntrack_tuple target, 145 struct flow_action *action) 146 { 147 __be16 target_src = target.src.u.tcp.port; 148 __be16 target_dst = target.dst.u.tcp.port; 149 150 if (target_src != tuple->src.u.tcp.port) 151 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, 152 offsetof(struct tcphdr, source), 153 0xFFFF, be16_to_cpu(target_src)); 154 if (target_dst != tuple->dst.u.tcp.port) 155 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP, 156 offsetof(struct tcphdr, dest), 157 0xFFFF, be16_to_cpu(target_dst)); 158 } 159 160 static void 161 tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, 162 struct nf_conntrack_tuple target, 163 struct flow_action *action) 164 { 165 __be16 target_src = target.src.u.udp.port; 166 __be16 target_dst = target.dst.u.udp.port; 167 168 if (target_src != tuple->src.u.udp.port) 169 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, 170 offsetof(struct udphdr, source), 171 0xFFFF, be16_to_cpu(target_src)); 172 if (target_dst != tuple->dst.u.udp.port) 173 tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP, 174 offsetof(struct udphdr, dest), 175 0xFFFF, be16_to_cpu(target_dst)); 176 } 177 178 static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, 179 enum ip_conntrack_dir dir, 180 enum ip_conntrack_info ctinfo, 181 struct flow_action *action) 182 { 183 struct nf_conn_labels *ct_labels; 184 struct flow_action_entry *entry; 185 u32 *act_ct_labels; 186 187 entry = tcf_ct_flow_table_flow_action_get_next(action); 188 entry->id = FLOW_ACTION_CT_METADATA; 189 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 190 entry->ct_metadata.mark = READ_ONCE(ct->mark); 191 #endif 192 /* aligns with the CT reference on the SKB nf_ct_set */ 193 entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; 194 entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; 195 196 act_ct_labels = entry->ct_metadata.labels; 197 ct_labels = nf_ct_labels_find(ct); 198 if (ct_labels) 199 memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE); 200 else 201 memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE); 202 } 203 204 static int tcf_ct_flow_table_add_action_nat(struct net *net, 205 struct nf_conn *ct, 206 enum ip_conntrack_dir dir, 207 struct flow_action *action) 208 { 209 const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; 210 struct nf_conntrack_tuple target; 211 212 if (!(ct->status & IPS_NAT_MASK)) 213 return 0; 214 215 nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); 216 217 switch (tuple->src.l3num) { 218 case NFPROTO_IPV4: 219 tcf_ct_flow_table_add_action_nat_ipv4(tuple, target, 220 action); 221 break; 222 case NFPROTO_IPV6: 223 tcf_ct_flow_table_add_action_nat_ipv6(tuple, target, 224 action); 225 break; 226 default: 227 return -EOPNOTSUPP; 228 } 229 230 switch (nf_ct_protonum(ct)) { 231 case IPPROTO_TCP: 232 tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action); 233 break; 234 case IPPROTO_UDP: 235 tcf_ct_flow_table_add_action_nat_udp(tuple, target, action); 236 break; 237 default: 238 return -EOPNOTSUPP; 239 } 240 241 return 0; 242 } 243 244 static int tcf_ct_flow_table_fill_actions(struct net *net, 245 struct flow_offload *flow, 246 enum flow_offload_tuple_dir tdir, 247 struct nf_flow_rule *flow_rule) 248 { 249 struct flow_action *action = &flow_rule->rule->action; 250 int num_entries = action->num_entries; 251 struct nf_conn *ct = flow->ct; 252 enum ip_conntrack_info ctinfo; 253 enum ip_conntrack_dir dir; 254 int i, err; 255 256 switch (tdir) { 257 case FLOW_OFFLOAD_DIR_ORIGINAL: 258 dir = IP_CT_DIR_ORIGINAL; 259 ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 260 IP_CT_ESTABLISHED : IP_CT_NEW; 261 if (ctinfo == IP_CT_ESTABLISHED) 262 set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); 263 break; 264 case FLOW_OFFLOAD_DIR_REPLY: 265 dir = IP_CT_DIR_REPLY; 266 ctinfo = IP_CT_ESTABLISHED_REPLY; 267 break; 268 default: 269 return -EOPNOTSUPP; 270 } 271 272 err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action); 273 if (err) 274 goto err_nat; 275 276 tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action); 277 return 0; 278 279 err_nat: 280 /* Clear filled actions */ 281 for (i = num_entries; i < action->num_entries; i++) 282 memset(&action->entries[i], 0, sizeof(action->entries[i])); 283 action->num_entries = num_entries; 284 285 return err; 286 } 287 288 static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) 289 { 290 return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && 291 test_bit(IPS_HW_OFFLOAD_BIT, &flow->ct->status) && 292 !test_bit(NF_FLOW_HW_PENDING, &flow->flags) && 293 !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); 294 } 295 296 static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft); 297 298 static void tcf_ct_nf_get(struct nf_flowtable *ft) 299 { 300 struct tcf_ct_flow_table *ct_ft = 301 container_of(ft, struct tcf_ct_flow_table, nf_ft); 302 303 tcf_ct_flow_table_get_ref(ct_ft); 304 } 305 306 static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft); 307 308 static void tcf_ct_nf_put(struct nf_flowtable *ft) 309 { 310 struct tcf_ct_flow_table *ct_ft = 311 container_of(ft, struct tcf_ct_flow_table, nf_ft); 312 313 tcf_ct_flow_table_put(ct_ft); 314 } 315 316 static struct nf_flowtable_type flowtable_ct = { 317 .gc = tcf_ct_flow_is_outdated, 318 .action = tcf_ct_flow_table_fill_actions, 319 .get = tcf_ct_nf_get, 320 .put = tcf_ct_nf_put, 321 .owner = THIS_MODULE, 322 }; 323 324 static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) 325 { 326 struct zones_ht_key key = { .net = net, .zone = params->zone }; 327 struct tcf_ct_flow_table *ct_ft; 328 int err = -ENOMEM; 329 330 mutex_lock(&zones_mutex); 331 ct_ft = rhashtable_lookup_fast(&zones_ht, &key, zones_params); 332 if (ct_ft && refcount_inc_not_zero(&ct_ft->ref)) 333 goto out_unlock; 334 335 ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL); 336 if (!ct_ft) 337 goto err_alloc; 338 refcount_set(&ct_ft->ref, 1); 339 340 ct_ft->key = key; 341 err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params); 342 if (err) 343 goto err_insert; 344 345 ct_ft->nf_ft.type = &flowtable_ct; 346 ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD | 347 NF_FLOWTABLE_COUNTER; 348 err = nf_flow_table_init(&ct_ft->nf_ft); 349 if (err) 350 goto err_init; 351 write_pnet(&ct_ft->nf_ft.net, net); 352 353 __module_get(THIS_MODULE); 354 out_unlock: 355 params->ct_ft = ct_ft; 356 params->nf_ft = &ct_ft->nf_ft; 357 mutex_unlock(&zones_mutex); 358 359 return 0; 360 361 err_init: 362 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); 363 err_insert: 364 kfree(ct_ft); 365 err_alloc: 366 mutex_unlock(&zones_mutex); 367 return err; 368 } 369 370 static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft) 371 { 372 refcount_inc(&ct_ft->ref); 373 } 374 375 static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) 376 { 377 struct tcf_ct_flow_table *ct_ft; 378 struct flow_block *block; 379 380 ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table, 381 rwork); 382 nf_flow_table_free(&ct_ft->nf_ft); 383 384 block = &ct_ft->nf_ft.flow_block; 385 down_write(&ct_ft->nf_ft.flow_block_lock); 386 WARN_ON(!list_empty(&block->cb_list)); 387 up_write(&ct_ft->nf_ft.flow_block_lock); 388 kfree(ct_ft); 389 390 module_put(THIS_MODULE); 391 } 392 393 static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft) 394 { 395 if (refcount_dec_and_test(&ct_ft->ref)) { 396 rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); 397 INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); 398 queue_rcu_work(act_ct_wq, &ct_ft->rwork); 399 } 400 } 401 402 static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, 403 struct nf_conn_act_ct_ext *act_ct_ext, u8 dir) 404 { 405 entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC; 406 entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir]; 407 } 408 409 static void tcf_ct_flow_ct_ext_ifidx_update(struct flow_offload *entry) 410 { 411 struct nf_conn_act_ct_ext *act_ct_ext; 412 413 act_ct_ext = nf_conn_act_ct_ext_find(entry->ct); 414 if (act_ct_ext) { 415 tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); 416 tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); 417 } 418 } 419 420 static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, 421 struct nf_conn *ct, 422 bool tcp, bool bidirectional) 423 { 424 struct nf_conn_act_ct_ext *act_ct_ext; 425 struct flow_offload *entry; 426 int err; 427 428 if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status)) 429 return; 430 431 entry = flow_offload_alloc(ct); 432 if (!entry) { 433 WARN_ON_ONCE(1); 434 goto err_alloc; 435 } 436 437 if (tcp) { 438 ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 439 ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; 440 } 441 if (bidirectional) 442 __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags); 443 444 act_ct_ext = nf_conn_act_ct_ext_find(ct); 445 if (act_ct_ext) { 446 tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL); 447 tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY); 448 } 449 450 err = flow_offload_add(&ct_ft->nf_ft, entry); 451 if (err) 452 goto err_add; 453 454 return; 455 456 err_add: 457 flow_offload_free(entry); 458 err_alloc: 459 clear_bit(IPS_OFFLOAD_BIT, &ct->status); 460 } 461 462 static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, 463 struct nf_conn *ct, 464 enum ip_conntrack_info ctinfo) 465 { 466 bool tcp = false, bidirectional = true; 467 468 switch (nf_ct_protonum(ct)) { 469 case IPPROTO_TCP: 470 if ((ctinfo != IP_CT_ESTABLISHED && 471 ctinfo != IP_CT_ESTABLISHED_REPLY) || 472 !test_bit(IPS_ASSURED_BIT, &ct->status) || 473 ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) 474 return; 475 476 tcp = true; 477 break; 478 case IPPROTO_UDP: 479 if (!nf_ct_is_confirmed(ct)) 480 return; 481 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 482 bidirectional = false; 483 break; 484 #ifdef CONFIG_NF_CT_PROTO_GRE 485 case IPPROTO_GRE: { 486 struct nf_conntrack_tuple *tuple; 487 488 if ((ctinfo != IP_CT_ESTABLISHED && 489 ctinfo != IP_CT_ESTABLISHED_REPLY) || 490 !test_bit(IPS_ASSURED_BIT, &ct->status) || 491 ct->status & IPS_NAT_MASK) 492 return; 493 494 tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 495 /* No support for GRE v1 */ 496 if (tuple->src.u.gre.key || tuple->dst.u.gre.key) 497 return; 498 break; 499 } 500 #endif 501 default: 502 return; 503 } 504 505 if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) || 506 ct->status & IPS_SEQ_ADJUST) 507 return; 508 509 tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional); 510 } 511 512 static bool 513 tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb, 514 struct flow_offload_tuple *tuple, 515 struct tcphdr **tcph) 516 { 517 struct flow_ports *ports; 518 unsigned int thoff; 519 struct iphdr *iph; 520 size_t hdrsize; 521 u8 ipproto; 522 523 if (!pskb_network_may_pull(skb, sizeof(*iph))) 524 return false; 525 526 iph = ip_hdr(skb); 527 thoff = iph->ihl * 4; 528 529 if (ip_is_fragment(iph) || 530 unlikely(thoff != sizeof(struct iphdr))) 531 return false; 532 533 ipproto = iph->protocol; 534 switch (ipproto) { 535 case IPPROTO_TCP: 536 hdrsize = sizeof(struct tcphdr); 537 break; 538 case IPPROTO_UDP: 539 hdrsize = sizeof(*ports); 540 break; 541 #ifdef CONFIG_NF_CT_PROTO_GRE 542 case IPPROTO_GRE: 543 hdrsize = sizeof(struct gre_base_hdr); 544 break; 545 #endif 546 default: 547 return false; 548 } 549 550 if (iph->ttl <= 1) 551 return false; 552 553 if (!pskb_network_may_pull(skb, thoff + hdrsize)) 554 return false; 555 556 switch (ipproto) { 557 case IPPROTO_TCP: 558 *tcph = (void *)(skb_network_header(skb) + thoff); 559 fallthrough; 560 case IPPROTO_UDP: 561 ports = (struct flow_ports *)(skb_network_header(skb) + thoff); 562 tuple->src_port = ports->source; 563 tuple->dst_port = ports->dest; 564 break; 565 case IPPROTO_GRE: { 566 struct gre_base_hdr *greh; 567 568 greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); 569 if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) 570 return false; 571 break; 572 } 573 } 574 575 iph = ip_hdr(skb); 576 577 tuple->src_v4.s_addr = iph->saddr; 578 tuple->dst_v4.s_addr = iph->daddr; 579 tuple->l3proto = AF_INET; 580 tuple->l4proto = ipproto; 581 582 return true; 583 } 584 585 static bool 586 tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb, 587 struct flow_offload_tuple *tuple, 588 struct tcphdr **tcph) 589 { 590 struct flow_ports *ports; 591 struct ipv6hdr *ip6h; 592 unsigned int thoff; 593 size_t hdrsize; 594 u8 nexthdr; 595 596 if (!pskb_network_may_pull(skb, sizeof(*ip6h))) 597 return false; 598 599 ip6h = ipv6_hdr(skb); 600 thoff = sizeof(*ip6h); 601 602 nexthdr = ip6h->nexthdr; 603 switch (nexthdr) { 604 case IPPROTO_TCP: 605 hdrsize = sizeof(struct tcphdr); 606 break; 607 case IPPROTO_UDP: 608 hdrsize = sizeof(*ports); 609 break; 610 #ifdef CONFIG_NF_CT_PROTO_GRE 611 case IPPROTO_GRE: 612 hdrsize = sizeof(struct gre_base_hdr); 613 break; 614 #endif 615 default: 616 return false; 617 } 618 619 if (ip6h->hop_limit <= 1) 620 return false; 621 622 if (!pskb_network_may_pull(skb, thoff + hdrsize)) 623 return false; 624 625 switch (nexthdr) { 626 case IPPROTO_TCP: 627 *tcph = (void *)(skb_network_header(skb) + thoff); 628 fallthrough; 629 case IPPROTO_UDP: 630 ports = (struct flow_ports *)(skb_network_header(skb) + thoff); 631 tuple->src_port = ports->source; 632 tuple->dst_port = ports->dest; 633 break; 634 case IPPROTO_GRE: { 635 struct gre_base_hdr *greh; 636 637 greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); 638 if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) 639 return false; 640 break; 641 } 642 } 643 644 ip6h = ipv6_hdr(skb); 645 646 tuple->src_v6 = ip6h->saddr; 647 tuple->dst_v6 = ip6h->daddr; 648 tuple->l3proto = AF_INET6; 649 tuple->l4proto = nexthdr; 650 651 return true; 652 } 653 654 static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, 655 struct sk_buff *skb, 656 u8 family) 657 { 658 struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft; 659 struct flow_offload_tuple_rhash *tuplehash; 660 struct flow_offload_tuple tuple = {}; 661 enum ip_conntrack_info ctinfo; 662 struct tcphdr *tcph = NULL; 663 bool force_refresh = false; 664 struct flow_offload *flow; 665 struct nf_conn *ct; 666 u8 dir; 667 668 switch (family) { 669 case NFPROTO_IPV4: 670 if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph)) 671 return false; 672 break; 673 case NFPROTO_IPV6: 674 if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph)) 675 return false; 676 break; 677 default: 678 return false; 679 } 680 681 tuplehash = flow_offload_lookup(nf_ft, &tuple); 682 if (!tuplehash) 683 return false; 684 685 dir = tuplehash->tuple.dir; 686 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 687 ct = flow->ct; 688 689 if (dir == FLOW_OFFLOAD_DIR_REPLY && 690 !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) { 691 /* Only offload reply direction after connection became 692 * assured. 693 */ 694 if (test_bit(IPS_ASSURED_BIT, &ct->status)) 695 set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); 696 else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags)) 697 /* If flow_table flow has already been updated to the 698 * established state, then don't refresh. 699 */ 700 return false; 701 force_refresh = true; 702 } 703 704 if (tcph && (unlikely(tcph->fin || tcph->rst))) { 705 flow_offload_teardown(flow); 706 return false; 707 } 708 709 if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) 710 ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 711 IP_CT_ESTABLISHED : IP_CT_NEW; 712 else 713 ctinfo = IP_CT_ESTABLISHED_REPLY; 714 715 nf_conn_act_ct_ext_fill(skb, ct, ctinfo); 716 tcf_ct_flow_ct_ext_ifidx_update(flow); 717 flow_offload_refresh(nf_ft, flow, force_refresh); 718 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { 719 /* Process this flow in SW to allow promoting to ASSURED */ 720 return false; 721 } 722 723 nf_conntrack_get(&ct->ct_general); 724 nf_ct_set(skb, ct, ctinfo); 725 if (nf_ft->flags & NF_FLOWTABLE_COUNTER) 726 nf_ct_acct_update(ct, dir, skb->len); 727 728 return true; 729 } 730 731 static int tcf_ct_flow_tables_init(void) 732 { 733 return rhashtable_init(&zones_ht, &zones_params); 734 } 735 736 static void tcf_ct_flow_tables_uninit(void) 737 { 738 rhashtable_destroy(&zones_ht); 739 } 740 741 static struct tc_action_ops act_ct_ops; 742 743 struct tc_ct_action_net { 744 struct tc_action_net tn; /* Must be first */ 745 bool labels; 746 }; 747 748 /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ 749 static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, 750 struct tcf_ct_params *p) 751 { 752 enum ip_conntrack_info ctinfo; 753 struct nf_conn *ct; 754 755 ct = nf_ct_get(skb, &ctinfo); 756 if (!ct) 757 return false; 758 if (!net_eq(net, read_pnet(&ct->ct_net))) 759 goto drop_ct; 760 if (nf_ct_zone(ct)->id != p->zone) 761 goto drop_ct; 762 if (p->helper) { 763 struct nf_conn_help *help; 764 765 help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); 766 if (help && rcu_access_pointer(help->helper) != p->helper) 767 goto drop_ct; 768 } 769 770 /* Force conntrack entry direction. */ 771 if ((p->ct_action & TCA_CT_ACT_FORCE) && 772 CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { 773 if (nf_ct_is_confirmed(ct)) 774 nf_ct_kill(ct); 775 776 goto drop_ct; 777 } 778 779 return true; 780 781 drop_ct: 782 nf_ct_put(ct); 783 nf_ct_set(skb, NULL, IP_CT_UNTRACKED); 784 785 return false; 786 } 787 788 static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) 789 { 790 u8 family = NFPROTO_UNSPEC; 791 792 switch (skb_protocol(skb, true)) { 793 case htons(ETH_P_IP): 794 family = NFPROTO_IPV4; 795 break; 796 case htons(ETH_P_IPV6): 797 family = NFPROTO_IPV6; 798 break; 799 default: 800 break; 801 } 802 803 return family; 804 } 805 806 static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag) 807 { 808 unsigned int len; 809 810 len = skb_network_offset(skb) + sizeof(struct iphdr); 811 if (unlikely(skb->len < len)) 812 return -EINVAL; 813 if (unlikely(!pskb_may_pull(skb, len))) 814 return -ENOMEM; 815 816 *frag = ip_is_fragment(ip_hdr(skb)); 817 return 0; 818 } 819 820 static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag) 821 { 822 unsigned int flags = 0, len, payload_ofs = 0; 823 unsigned short frag_off; 824 int nexthdr; 825 826 len = skb_network_offset(skb) + sizeof(struct ipv6hdr); 827 if (unlikely(skb->len < len)) 828 return -EINVAL; 829 if (unlikely(!pskb_may_pull(skb, len))) 830 return -ENOMEM; 831 832 nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); 833 if (unlikely(nexthdr < 0)) 834 return -EPROTO; 835 836 *frag = flags & IP6_FH_F_FRAG; 837 return 0; 838 } 839 840 static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, 841 u8 family, u16 zone, bool *defrag) 842 { 843 enum ip_conntrack_info ctinfo; 844 struct nf_conn *ct; 845 int err = 0; 846 bool frag; 847 u8 proto; 848 u16 mru; 849 850 /* Previously seen (loopback)? Ignore. */ 851 ct = nf_ct_get(skb, &ctinfo); 852 if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED) 853 return 0; 854 855 if (family == NFPROTO_IPV4) 856 err = tcf_ct_ipv4_is_fragment(skb, &frag); 857 else 858 err = tcf_ct_ipv6_is_fragment(skb, &frag); 859 if (err || !frag) 860 return err; 861 862 err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru); 863 if (err) 864 return err; 865 866 *defrag = true; 867 tc_skb_cb(skb)->mru = mru; 868 869 return 0; 870 } 871 872 static void tcf_ct_params_free(struct tcf_ct_params *params) 873 { 874 if (params->helper) { 875 #if IS_ENABLED(CONFIG_NF_NAT) 876 if (params->ct_action & TCA_CT_ACT_NAT) 877 nf_nat_helper_put(params->helper); 878 #endif 879 nf_conntrack_helper_put(params->helper); 880 } 881 if (params->ct_ft) 882 tcf_ct_flow_table_put(params->ct_ft); 883 if (params->tmpl) 884 nf_ct_put(params->tmpl); 885 kfree(params); 886 } 887 888 static void tcf_ct_params_free_rcu(struct rcu_head *head) 889 { 890 struct tcf_ct_params *params; 891 892 params = container_of(head, struct tcf_ct_params, rcu); 893 tcf_ct_params_free(params); 894 } 895 896 static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) 897 { 898 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 899 u32 new_mark; 900 901 if (!mask) 902 return; 903 904 new_mark = mark | (READ_ONCE(ct->mark) & ~(mask)); 905 if (READ_ONCE(ct->mark) != new_mark) { 906 WRITE_ONCE(ct->mark, new_mark); 907 if (nf_ct_is_confirmed(ct)) 908 nf_conntrack_event_cache(IPCT_MARK, ct); 909 } 910 #endif 911 } 912 913 static void tcf_ct_act_set_labels(struct nf_conn *ct, 914 u32 *labels, 915 u32 *labels_m) 916 { 917 #if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) 918 size_t labels_sz = sizeof_field(struct tcf_ct_params, labels); 919 920 if (!memchr_inv(labels_m, 0, labels_sz)) 921 return; 922 923 nf_connlabels_replace(ct, labels, labels_m, 4); 924 #endif 925 } 926 927 static int tcf_ct_act_nat(struct sk_buff *skb, 928 struct nf_conn *ct, 929 enum ip_conntrack_info ctinfo, 930 int ct_action, 931 struct nf_nat_range2 *range, 932 bool commit) 933 { 934 #if IS_ENABLED(CONFIG_NF_NAT) 935 int err, action = 0; 936 937 if (!(ct_action & TCA_CT_ACT_NAT)) 938 return NF_ACCEPT; 939 if (ct_action & TCA_CT_ACT_NAT_SRC) 940 action |= BIT(NF_NAT_MANIP_SRC); 941 if (ct_action & TCA_CT_ACT_NAT_DST) 942 action |= BIT(NF_NAT_MANIP_DST); 943 944 err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit); 945 946 if (action & BIT(NF_NAT_MANIP_SRC)) 947 tc_skb_cb(skb)->post_ct_snat = 1; 948 if (action & BIT(NF_NAT_MANIP_DST)) 949 tc_skb_cb(skb)->post_ct_dnat = 1; 950 951 return err; 952 #else 953 return NF_ACCEPT; 954 #endif 955 } 956 957 TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, 958 struct tcf_result *res) 959 { 960 struct net *net = dev_net(skb->dev); 961 enum ip_conntrack_info ctinfo; 962 struct tcf_ct *c = to_ct(a); 963 struct nf_conn *tmpl = NULL; 964 struct nf_hook_state state; 965 bool cached, commit, clear; 966 int nh_ofs, err, retval; 967 struct tcf_ct_params *p; 968 bool add_helper = false; 969 bool skip_add = false; 970 bool defrag = false; 971 struct nf_conn *ct; 972 u8 family; 973 974 p = rcu_dereference_bh(c->params); 975 976 retval = READ_ONCE(c->tcf_action); 977 commit = p->ct_action & TCA_CT_ACT_COMMIT; 978 clear = p->ct_action & TCA_CT_ACT_CLEAR; 979 tmpl = p->tmpl; 980 981 tcf_lastuse_update(&c->tcf_tm); 982 tcf_action_update_bstats(&c->common, skb); 983 984 if (clear) { 985 tc_skb_cb(skb)->post_ct = false; 986 ct = nf_ct_get(skb, &ctinfo); 987 if (ct) { 988 nf_ct_put(ct); 989 nf_ct_set(skb, NULL, IP_CT_UNTRACKED); 990 } 991 992 goto out_clear; 993 } 994 995 family = tcf_ct_skb_nf_family(skb); 996 if (family == NFPROTO_UNSPEC) 997 goto drop; 998 999 /* The conntrack module expects to be working at L3. 1000 * We also try to pull the IPv4/6 header to linear area 1001 */ 1002 nh_ofs = skb_network_offset(skb); 1003 skb_pull_rcsum(skb, nh_ofs); 1004 err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag); 1005 if (err) 1006 goto out_frag; 1007 1008 err = nf_ct_skb_network_trim(skb, family); 1009 if (err) 1010 goto drop; 1011 1012 /* If we are recirculating packets to match on ct fields and 1013 * committing with a separate ct action, then we don't need to 1014 * actually run the packet through conntrack twice unless it's for a 1015 * different zone. 1016 */ 1017 cached = tcf_ct_skb_nfct_cached(net, skb, p); 1018 if (!cached) { 1019 if (tcf_ct_flow_table_lookup(p, skb, family)) { 1020 skip_add = true; 1021 goto do_nat; 1022 } 1023 1024 /* Associate skb with specified zone. */ 1025 if (tmpl) { 1026 nf_conntrack_put(skb_nfct(skb)); 1027 nf_conntrack_get(&tmpl->ct_general); 1028 nf_ct_set(skb, tmpl, IP_CT_NEW); 1029 } 1030 1031 state.hook = NF_INET_PRE_ROUTING; 1032 state.net = net; 1033 state.pf = family; 1034 err = nf_conntrack_in(skb, &state); 1035 if (err != NF_ACCEPT) 1036 goto out_push; 1037 } 1038 1039 do_nat: 1040 ct = nf_ct_get(skb, &ctinfo); 1041 if (!ct) 1042 goto out_push; 1043 nf_ct_deliver_cached_events(ct); 1044 nf_conn_act_ct_ext_fill(skb, ct, ctinfo); 1045 1046 err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit); 1047 if (err != NF_ACCEPT) 1048 goto drop; 1049 1050 if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) { 1051 err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC); 1052 if (err) 1053 goto drop; 1054 add_helper = true; 1055 if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) { 1056 if (!nfct_seqadj_ext_add(ct)) 1057 goto drop; 1058 } 1059 } 1060 1061 if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) { 1062 if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT) 1063 goto drop; 1064 } 1065 1066 if (commit) { 1067 tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); 1068 tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); 1069 1070 if (!nf_ct_is_confirmed(ct)) 1071 nf_conn_act_ct_ext_add(skb, ct, ctinfo); 1072 1073 /* This will take care of sending queued events 1074 * even if the connection is already confirmed. 1075 */ 1076 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 1077 goto drop; 1078 1079 /* The ct may be dropped if a clash has been resolved, 1080 * so it's necessary to retrieve it from skb again to 1081 * prevent UAF. 1082 */ 1083 ct = nf_ct_get(skb, &ctinfo); 1084 if (!ct) 1085 skip_add = true; 1086 } 1087 1088 if (!skip_add) 1089 tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo); 1090 1091 out_push: 1092 skb_push_rcsum(skb, nh_ofs); 1093 1094 tc_skb_cb(skb)->post_ct = true; 1095 tc_skb_cb(skb)->zone = p->zone; 1096 out_clear: 1097 if (defrag) 1098 qdisc_skb_cb(skb)->pkt_len = skb->len; 1099 return retval; 1100 1101 out_frag: 1102 if (err != -EINPROGRESS) 1103 tcf_action_inc_drop_qstats(&c->common); 1104 return TC_ACT_CONSUMED; 1105 1106 drop: 1107 tcf_action_inc_drop_qstats(&c->common); 1108 return TC_ACT_SHOT; 1109 } 1110 1111 static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { 1112 [TCA_CT_ACTION] = { .type = NLA_U16 }, 1113 [TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)), 1114 [TCA_CT_ZONE] = { .type = NLA_U16 }, 1115 [TCA_CT_MARK] = { .type = NLA_U32 }, 1116 [TCA_CT_MARK_MASK] = { .type = NLA_U32 }, 1117 [TCA_CT_LABELS] = { .type = NLA_BINARY, 1118 .len = 128 / BITS_PER_BYTE }, 1119 [TCA_CT_LABELS_MASK] = { .type = NLA_BINARY, 1120 .len = 128 / BITS_PER_BYTE }, 1121 [TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 }, 1122 [TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 }, 1123 [TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), 1124 [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), 1125 [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, 1126 [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, 1127 [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN }, 1128 [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 }, 1129 [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 }, 1130 }; 1131 1132 static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, 1133 struct tc_ct *parm, 1134 struct nlattr **tb, 1135 struct netlink_ext_ack *extack) 1136 { 1137 struct nf_nat_range2 *range; 1138 1139 if (!(p->ct_action & TCA_CT_ACT_NAT)) 1140 return 0; 1141 1142 if (!IS_ENABLED(CONFIG_NF_NAT)) { 1143 NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel"); 1144 return -EOPNOTSUPP; 1145 } 1146 1147 if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) 1148 return 0; 1149 1150 if ((p->ct_action & TCA_CT_ACT_NAT_SRC) && 1151 (p->ct_action & TCA_CT_ACT_NAT_DST)) { 1152 NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time"); 1153 return -EOPNOTSUPP; 1154 } 1155 1156 range = &p->range; 1157 if (tb[TCA_CT_NAT_IPV4_MIN]) { 1158 struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX]; 1159 1160 p->ipv4_range = true; 1161 range->flags |= NF_NAT_RANGE_MAP_IPS; 1162 range->min_addr.ip = 1163 nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]); 1164 1165 range->max_addr.ip = max_attr ? 1166 nla_get_in_addr(max_attr) : 1167 range->min_addr.ip; 1168 } else if (tb[TCA_CT_NAT_IPV6_MIN]) { 1169 struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX]; 1170 1171 p->ipv4_range = false; 1172 range->flags |= NF_NAT_RANGE_MAP_IPS; 1173 range->min_addr.in6 = 1174 nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]); 1175 1176 range->max_addr.in6 = max_attr ? 1177 nla_get_in6_addr(max_attr) : 1178 range->min_addr.in6; 1179 } 1180 1181 if (tb[TCA_CT_NAT_PORT_MIN]) { 1182 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 1183 range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]); 1184 1185 range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ? 1186 nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) : 1187 range->min_proto.all; 1188 } 1189 1190 return 0; 1191 } 1192 1193 static void tcf_ct_set_key_val(struct nlattr **tb, 1194 void *val, int val_type, 1195 void *mask, int mask_type, 1196 int len) 1197 { 1198 if (!tb[val_type]) 1199 return; 1200 nla_memcpy(val, tb[val_type], len); 1201 1202 if (!mask) 1203 return; 1204 1205 if (mask_type == TCA_CT_UNSPEC || !tb[mask_type]) 1206 memset(mask, 0xff, len); 1207 else 1208 nla_memcpy(mask, tb[mask_type], len); 1209 } 1210 1211 static int tcf_ct_fill_params(struct net *net, 1212 struct tcf_ct_params *p, 1213 struct tc_ct *parm, 1214 struct nlattr **tb, 1215 struct netlink_ext_ack *extack) 1216 { 1217 struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); 1218 struct nf_conntrack_zone zone; 1219 int err, family, proto, len; 1220 struct nf_conn *tmpl; 1221 char *name; 1222 1223 p->zone = NF_CT_DEFAULT_ZONE_ID; 1224 1225 tcf_ct_set_key_val(tb, 1226 &p->ct_action, TCA_CT_ACTION, 1227 NULL, TCA_CT_UNSPEC, 1228 sizeof(p->ct_action)); 1229 1230 if (p->ct_action & TCA_CT_ACT_CLEAR) 1231 return 0; 1232 1233 err = tcf_ct_fill_params_nat(p, parm, tb, extack); 1234 if (err) 1235 return err; 1236 1237 if (tb[TCA_CT_MARK]) { 1238 if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) { 1239 NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled."); 1240 return -EOPNOTSUPP; 1241 } 1242 tcf_ct_set_key_val(tb, 1243 &p->mark, TCA_CT_MARK, 1244 &p->mark_mask, TCA_CT_MARK_MASK, 1245 sizeof(p->mark)); 1246 } 1247 1248 if (tb[TCA_CT_LABELS]) { 1249 if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) { 1250 NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled."); 1251 return -EOPNOTSUPP; 1252 } 1253 1254 if (!tn->labels) { 1255 NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length"); 1256 return -EOPNOTSUPP; 1257 } 1258 tcf_ct_set_key_val(tb, 1259 p->labels, TCA_CT_LABELS, 1260 p->labels_mask, TCA_CT_LABELS_MASK, 1261 sizeof(p->labels)); 1262 } 1263 1264 if (tb[TCA_CT_ZONE]) { 1265 if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) { 1266 NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled."); 1267 return -EOPNOTSUPP; 1268 } 1269 1270 tcf_ct_set_key_val(tb, 1271 &p->zone, TCA_CT_ZONE, 1272 NULL, TCA_CT_UNSPEC, 1273 sizeof(p->zone)); 1274 } 1275 1276 nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0); 1277 tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL); 1278 if (!tmpl) { 1279 NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); 1280 return -ENOMEM; 1281 } 1282 p->tmpl = tmpl; 1283 if (tb[TCA_CT_HELPER_NAME]) { 1284 name = nla_data(tb[TCA_CT_HELPER_NAME]); 1285 len = nla_len(tb[TCA_CT_HELPER_NAME]); 1286 if (len > 16 || name[len - 1] != '\0') { 1287 NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name."); 1288 err = -EINVAL; 1289 goto err; 1290 } 1291 family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; 1292 proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; 1293 err = nf_ct_add_helper(tmpl, name, family, proto, 1294 p->ct_action & TCA_CT_ACT_NAT, &p->helper); 1295 if (err) { 1296 NL_SET_ERR_MSG_MOD(extack, "Failed to add helper"); 1297 goto err; 1298 } 1299 } 1300 1301 if (p->ct_action & TCA_CT_ACT_COMMIT) 1302 __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); 1303 return 0; 1304 err: 1305 nf_ct_put(p->tmpl); 1306 p->tmpl = NULL; 1307 return err; 1308 } 1309 1310 static int tcf_ct_init(struct net *net, struct nlattr *nla, 1311 struct nlattr *est, struct tc_action **a, 1312 struct tcf_proto *tp, u32 flags, 1313 struct netlink_ext_ack *extack) 1314 { 1315 struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id); 1316 bool bind = flags & TCA_ACT_FLAGS_BIND; 1317 struct tcf_ct_params *params = NULL; 1318 struct nlattr *tb[TCA_CT_MAX + 1]; 1319 struct tcf_chain *goto_ch = NULL; 1320 struct tc_ct *parm; 1321 struct tcf_ct *c; 1322 int err, res = 0; 1323 u32 index; 1324 1325 if (!nla) { 1326 NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed"); 1327 return -EINVAL; 1328 } 1329 1330 err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack); 1331 if (err < 0) 1332 return err; 1333 1334 if (!tb[TCA_CT_PARMS]) { 1335 NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters"); 1336 return -EINVAL; 1337 } 1338 parm = nla_data(tb[TCA_CT_PARMS]); 1339 index = parm->index; 1340 err = tcf_idr_check_alloc(tn, &index, a, bind); 1341 if (err < 0) 1342 return err; 1343 1344 if (!err) { 1345 err = tcf_idr_create_from_flags(tn, index, est, a, 1346 &act_ct_ops, bind, flags); 1347 if (err) { 1348 tcf_idr_cleanup(tn, index); 1349 return err; 1350 } 1351 res = ACT_P_CREATED; 1352 } else { 1353 if (bind) 1354 return 0; 1355 1356 if (!(flags & TCA_ACT_FLAGS_REPLACE)) { 1357 tcf_idr_release(*a, bind); 1358 return -EEXIST; 1359 } 1360 } 1361 err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); 1362 if (err < 0) 1363 goto cleanup; 1364 1365 c = to_ct(*a); 1366 1367 params = kzalloc(sizeof(*params), GFP_KERNEL); 1368 if (unlikely(!params)) { 1369 err = -ENOMEM; 1370 goto cleanup; 1371 } 1372 1373 err = tcf_ct_fill_params(net, params, parm, tb, extack); 1374 if (err) 1375 goto cleanup; 1376 1377 err = tcf_ct_flow_table_get(net, params); 1378 if (err) 1379 goto cleanup; 1380 1381 spin_lock_bh(&c->tcf_lock); 1382 goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); 1383 params = rcu_replace_pointer(c->params, params, 1384 lockdep_is_held(&c->tcf_lock)); 1385 spin_unlock_bh(&c->tcf_lock); 1386 1387 if (goto_ch) 1388 tcf_chain_put_by_act(goto_ch); 1389 if (params) 1390 call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); 1391 1392 return res; 1393 1394 cleanup: 1395 if (goto_ch) 1396 tcf_chain_put_by_act(goto_ch); 1397 if (params) 1398 tcf_ct_params_free(params); 1399 tcf_idr_release(*a, bind); 1400 return err; 1401 } 1402 1403 static void tcf_ct_cleanup(struct tc_action *a) 1404 { 1405 struct tcf_ct_params *params; 1406 struct tcf_ct *c = to_ct(a); 1407 1408 params = rcu_dereference_protected(c->params, 1); 1409 if (params) 1410 call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); 1411 } 1412 1413 static int tcf_ct_dump_key_val(struct sk_buff *skb, 1414 void *val, int val_type, 1415 void *mask, int mask_type, 1416 int len) 1417 { 1418 int err; 1419 1420 if (mask && !memchr_inv(mask, 0, len)) 1421 return 0; 1422 1423 err = nla_put(skb, val_type, len, val); 1424 if (err) 1425 return err; 1426 1427 if (mask_type != TCA_CT_UNSPEC) { 1428 err = nla_put(skb, mask_type, len, mask); 1429 if (err) 1430 return err; 1431 } 1432 1433 return 0; 1434 } 1435 1436 static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) 1437 { 1438 struct nf_nat_range2 *range = &p->range; 1439 1440 if (!(p->ct_action & TCA_CT_ACT_NAT)) 1441 return 0; 1442 1443 if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST))) 1444 return 0; 1445 1446 if (range->flags & NF_NAT_RANGE_MAP_IPS) { 1447 if (p->ipv4_range) { 1448 if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN, 1449 range->min_addr.ip)) 1450 return -1; 1451 if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX, 1452 range->max_addr.ip)) 1453 return -1; 1454 } else { 1455 if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN, 1456 &range->min_addr.in6)) 1457 return -1; 1458 if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX, 1459 &range->max_addr.in6)) 1460 return -1; 1461 } 1462 } 1463 1464 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 1465 if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN, 1466 range->min_proto.all)) 1467 return -1; 1468 if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX, 1469 range->max_proto.all)) 1470 return -1; 1471 } 1472 1473 return 0; 1474 } 1475 1476 static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) 1477 { 1478 if (!helper) 1479 return 0; 1480 1481 if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) || 1482 nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) || 1483 nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum)) 1484 return -1; 1485 1486 return 0; 1487 } 1488 1489 static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, 1490 int bind, int ref) 1491 { 1492 unsigned char *b = skb_tail_pointer(skb); 1493 struct tcf_ct *c = to_ct(a); 1494 struct tcf_ct_params *p; 1495 1496 struct tc_ct opt = { 1497 .index = c->tcf_index, 1498 .refcnt = refcount_read(&c->tcf_refcnt) - ref, 1499 .bindcnt = atomic_read(&c->tcf_bindcnt) - bind, 1500 }; 1501 struct tcf_t t; 1502 1503 spin_lock_bh(&c->tcf_lock); 1504 p = rcu_dereference_protected(c->params, 1505 lockdep_is_held(&c->tcf_lock)); 1506 opt.action = c->tcf_action; 1507 1508 if (tcf_ct_dump_key_val(skb, 1509 &p->ct_action, TCA_CT_ACTION, 1510 NULL, TCA_CT_UNSPEC, 1511 sizeof(p->ct_action))) 1512 goto nla_put_failure; 1513 1514 if (p->ct_action & TCA_CT_ACT_CLEAR) 1515 goto skip_dump; 1516 1517 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 1518 tcf_ct_dump_key_val(skb, 1519 &p->mark, TCA_CT_MARK, 1520 &p->mark_mask, TCA_CT_MARK_MASK, 1521 sizeof(p->mark))) 1522 goto nla_put_failure; 1523 1524 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1525 tcf_ct_dump_key_val(skb, 1526 p->labels, TCA_CT_LABELS, 1527 p->labels_mask, TCA_CT_LABELS_MASK, 1528 sizeof(p->labels))) 1529 goto nla_put_failure; 1530 1531 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1532 tcf_ct_dump_key_val(skb, 1533 &p->zone, TCA_CT_ZONE, 1534 NULL, TCA_CT_UNSPEC, 1535 sizeof(p->zone))) 1536 goto nla_put_failure; 1537 1538 if (tcf_ct_dump_nat(skb, p)) 1539 goto nla_put_failure; 1540 1541 if (tcf_ct_dump_helper(skb, p->helper)) 1542 goto nla_put_failure; 1543 1544 skip_dump: 1545 if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) 1546 goto nla_put_failure; 1547 1548 tcf_tm_dump(&t, &c->tcf_tm); 1549 if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) 1550 goto nla_put_failure; 1551 spin_unlock_bh(&c->tcf_lock); 1552 1553 return skb->len; 1554 nla_put_failure: 1555 spin_unlock_bh(&c->tcf_lock); 1556 nlmsg_trim(skb, b); 1557 return -1; 1558 } 1559 1560 static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, 1561 u64 drops, u64 lastuse, bool hw) 1562 { 1563 struct tcf_ct *c = to_ct(a); 1564 1565 tcf_action_update_stats(a, bytes, packets, drops, hw); 1566 c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse); 1567 } 1568 1569 static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data, 1570 u32 *index_inc, bool bind, 1571 struct netlink_ext_ack *extack) 1572 { 1573 if (bind) { 1574 struct flow_action_entry *entry = entry_data; 1575 1576 if (tcf_ct_helper(act)) 1577 return -EOPNOTSUPP; 1578 1579 entry->id = FLOW_ACTION_CT; 1580 entry->ct.action = tcf_ct_action(act); 1581 entry->ct.zone = tcf_ct_zone(act); 1582 entry->ct.flow_table = tcf_ct_ft(act); 1583 *index_inc = 1; 1584 } else { 1585 struct flow_offload_action *fl_action = entry_data; 1586 1587 fl_action->id = FLOW_ACTION_CT; 1588 } 1589 1590 return 0; 1591 } 1592 1593 static struct tc_action_ops act_ct_ops = { 1594 .kind = "ct", 1595 .id = TCA_ID_CT, 1596 .owner = THIS_MODULE, 1597 .act = tcf_ct_act, 1598 .dump = tcf_ct_dump, 1599 .init = tcf_ct_init, 1600 .cleanup = tcf_ct_cleanup, 1601 .stats_update = tcf_stats_update, 1602 .offload_act_setup = tcf_ct_offload_act_setup, 1603 .size = sizeof(struct tcf_ct), 1604 }; 1605 1606 static __net_init int ct_init_net(struct net *net) 1607 { 1608 unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8; 1609 struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); 1610 1611 if (nf_connlabels_get(net, n_bits - 1)) { 1612 tn->labels = false; 1613 pr_err("act_ct: Failed to set connlabels length"); 1614 } else { 1615 tn->labels = true; 1616 } 1617 1618 return tc_action_net_init(net, &tn->tn, &act_ct_ops); 1619 } 1620 1621 static void __net_exit ct_exit_net(struct list_head *net_list) 1622 { 1623 struct net *net; 1624 1625 rtnl_lock(); 1626 list_for_each_entry(net, net_list, exit_list) { 1627 struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); 1628 1629 if (tn->labels) 1630 nf_connlabels_put(net); 1631 } 1632 rtnl_unlock(); 1633 1634 tc_action_net_exit(net_list, act_ct_ops.net_id); 1635 } 1636 1637 static struct pernet_operations ct_net_ops = { 1638 .init = ct_init_net, 1639 .exit_batch = ct_exit_net, 1640 .id = &act_ct_ops.net_id, 1641 .size = sizeof(struct tc_ct_action_net), 1642 }; 1643 1644 static int __init ct_init_module(void) 1645 { 1646 int err; 1647 1648 act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0); 1649 if (!act_ct_wq) 1650 return -ENOMEM; 1651 1652 err = tcf_ct_flow_tables_init(); 1653 if (err) 1654 goto err_tbl_init; 1655 1656 err = tcf_register_action(&act_ct_ops, &ct_net_ops); 1657 if (err) 1658 goto err_register; 1659 1660 static_branch_inc(&tcf_frag_xmit_count); 1661 1662 return 0; 1663 1664 err_register: 1665 tcf_ct_flow_tables_uninit(); 1666 err_tbl_init: 1667 destroy_workqueue(act_ct_wq); 1668 return err; 1669 } 1670 1671 static void __exit ct_cleanup_module(void) 1672 { 1673 static_branch_dec(&tcf_frag_xmit_count); 1674 tcf_unregister_action(&act_ct_ops, &ct_net_ops); 1675 tcf_ct_flow_tables_uninit(); 1676 destroy_workqueue(act_ct_wq); 1677 } 1678 1679 module_init(ct_init_module); 1680 module_exit(ct_cleanup_module); 1681 MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>"); 1682 MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>"); 1683 MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>"); 1684 MODULE_DESCRIPTION("Connection tracking action"); 1685 MODULE_LICENSE("GPL v2"); 1686