// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* -
 * net/sched/act_ct.c  Connection Tracking action
 *
 * Authors:   Paul Blakey <paulb@mellanox.com>
 *            Yossi Kuperman <yossiku@mellanox.com>
 *            Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/rhashtable.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>
#include <net/ip.h>
#include <net/ipv6_frag.h>
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>

#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <uapi/linux/netfilter/nf_nat.h>

static struct workqueue_struct *act_ct_wq;
static struct rhashtable zones_ht;
static DEFINE_MUTEX(zones_mutex);

struct tcf_ct_flow_table {
	struct rhash_head node; /* In zones tables */

	struct rcu_work rwork;
	struct nf_flowtable nf_ft;
	refcount_t ref;
	u16 zone;

	bool dying;
};

static const struct rhashtable_params zones_params = {
	.head_offset = offsetof(struct tcf_ct_flow_table, node),
	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
	.automatic_shrinking = true,
};

static struct nf_flowtable_type flowtable_ct = {
	.owner = THIS_MODULE,
};

static int tcf_ct_flow_table_get(struct tcf_ct_params *params)
{
	struct tcf_ct_flow_table *ct_ft;
	int err = -ENOMEM;

	mutex_lock(&zones_mutex);
	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
	if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
		goto out_unlock;

	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
	if (!ct_ft)
		goto err_alloc;
	refcount_set(&ct_ft->ref, 1);

	ct_ft->zone = params->zone;
	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
	if (err)
		goto err_insert;

	ct_ft->nf_ft.type = &flowtable_ct;
	err = nf_flow_table_init(&ct_ft->nf_ft);
	if (err)
		goto err_init;

	__module_get(THIS_MODULE);
out_unlock:
	params->ct_ft = ct_ft;
	mutex_unlock(&zones_mutex);

	return 0;

err_init:
	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
err_insert:
	kfree(ct_ft);
err_alloc:
	mutex_unlock(&zones_mutex);
	return err;
}

static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
{
	struct tcf_ct_flow_table *ct_ft;

	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
			     rwork);
	nf_flow_table_free(&ct_ft->nf_ft);
	kfree(ct_ft);

	module_put(THIS_MODULE);
}

static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
{
	struct tcf_ct_flow_table *ct_ft = params->ct_ft;

	if (refcount_dec_and_test(&params->ct_ft->ref)) {
		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
	}
}

static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
				  struct nf_conn *ct,
				  bool tcp)
{
	struct flow_offload *entry;
	int err;

	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		return;

	entry = flow_offload_alloc(ct);
	if (!entry) {
		WARN_ON_ONCE(1);
		goto err_alloc;
	}

	if (tcp) {
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	}

	err = flow_offload_add(&ct_ft->nf_ft, entry);
	if (err)
		goto err_add;

	return;

err_add:
	flow_offload_free(entry);
err_alloc:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
}

static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
					   struct nf_conn *ct,
					   enum ip_conntrack_info ctinfo)
{
	bool tcp = false;

	if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY)
		return;

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		tcp = true;
		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
			return;
		break;
	case IPPROTO_UDP:
		break;
	default:
		return;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & IPS_SEQ_ADJUST)
		return;

	tcf_ct_flow_table_add(ct_ft, ct, tcp);
}

static bool
tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	unsigned int thoff;
	struct iphdr *iph;

	if (!pskb_network_may_pull(skb, sizeof(*iph)))
		return false;

	iph = ip_hdr(skb);
	thoff = iph->ihl * 4;

	if (ip_is_fragment(iph) ||
	    unlikely(thoff != sizeof(struct iphdr)))
		return false;

	if (iph->protocol != IPPROTO_TCP &&
	    iph->protocol != IPPROTO_UDP)
		return false;

	if (iph->ttl <= 1)
		return false;

	if (!pskb_network_may_pull(skb, iph->protocol == IPPROTO_TCP ?
				   thoff + sizeof(struct tcphdr) :
				   thoff + sizeof(*ports)))
		return false;

	iph = ip_hdr(skb);
	if (iph->protocol == IPPROTO_TCP)
		*tcph = (void *)(skb_network_header(skb) + thoff);

	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
	tuple->src_v4.s_addr = iph->saddr;
	tuple->dst_v4.s_addr = iph->daddr;
	tuple->src_port = ports->source;
	tuple->dst_port = ports->dest;
	tuple->l3proto = AF_INET;
	tuple->l4proto = iph->protocol;

	return true;
}

static bool
tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	struct ipv6hdr *ip6h;
	unsigned int thoff;

	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
		return false;

	ip6h = ipv6_hdr(skb);

	if (ip6h->nexthdr != IPPROTO_TCP &&
	    ip6h->nexthdr != IPPROTO_UDP)
		return false;

	if (ip6h->hop_limit <= 1)
		return false;

	thoff = sizeof(*ip6h);
	if (!pskb_network_may_pull(skb, ip6h->nexthdr == IPPROTO_TCP ?
				   thoff + sizeof(struct tcphdr) :
				   thoff + sizeof(*ports)))
		return false;

	ip6h = ipv6_hdr(skb);
	if (ip6h->nexthdr == IPPROTO_TCP)
		*tcph = (void *)(skb_network_header(skb) + thoff);

	ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
	tuple->src_v6 = ip6h->saddr;
	tuple->dst_v6 = ip6h->daddr;
	tuple->src_port = ports->source;
	tuple->dst_port = ports->dest;
	tuple->l3proto = AF_INET6;
	tuple->l4proto = ip6h->nexthdr;

	return true;
}

static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
				     struct sk_buff *skb,
				     u8 family)
{
	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload_tuple tuple = {};
	enum ip_conntrack_info ctinfo;
	struct tcphdr *tcph = NULL;
	struct flow_offload *flow;
	struct nf_conn *ct;
	u8 dir;

	/* Previously seen or loopback */
	ct = nf_ct_get(skb, &ctinfo);
	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
		return false;

	switch (family) {
	case NFPROTO_IPV4:
		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
			return false;
		break;
	case NFPROTO_IPV6:
		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
			return false;
		break;
	default:
		return false;
	}

	tuplehash = flow_offload_lookup(nf_ft, &tuple);
	if (!tuplehash)
		return false;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	ct = flow->ct;

	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
		flow_offload_teardown(flow);
		return false;
	}

	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
						    IP_CT_ESTABLISHED_REPLY;

	nf_conntrack_get(&ct->ct_general);
	nf_ct_set(skb, ct, ctinfo);

	return true;
}

static int tcf_ct_flow_tables_init(void)
{
	return rhashtable_init(&zones_ht, &zones_params);
}

static void tcf_ct_flow_tables_uninit(void)
{
	rhashtable_destroy(&zones_ht);
}

static struct tc_action_ops act_ct_ops;
static unsigned int ct_net_id;

struct tc_ct_action_net {
	struct tc_action_net tn; /* Must be first */
	bool labels;
};

/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
				   u16 zone_id, bool force)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return false;
	if (!net_eq(net, read_pnet(&ct->ct_net)))
		return false;
	if (nf_ct_zone(ct)->id != zone_id)
		return false;

	/* Force conntrack entry direction. */
	if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
		if (nf_ct_is_confirmed(ct))
			nf_ct_kill(ct);

		nf_conntrack_put(&ct->ct_general);
		nf_ct_set(skb, NULL, IP_CT_UNTRACKED);

		return false;
	}

	return true;
}

/* Trim the skb to the length specified by the IP/IPv6 header,
 * removing any trailing lower-layer padding. This prepares the skb
 * for higher-layer processing that assumes skb->len excludes padding
 * (such as nf_ip_checksum). The caller needs to pull the skb to the
 * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
 */
static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
{
	unsigned int len;
	int err;

	switch (family) {
	case NFPROTO_IPV4:
		len = ntohs(ip_hdr(skb)->tot_len);
		break;
	case NFPROTO_IPV6:
		len = sizeof(struct ipv6hdr)
			+ ntohs(ipv6_hdr(skb)->payload_len);
		break;
	default:
		len = skb->len;
	}

	err = pskb_trim_rcsum(skb, len);

	return err;
}

static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{
	u8 family = NFPROTO_UNSPEC;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
		family = NFPROTO_IPV4;
		break;
	case htons(ETH_P_IPV6):
		family = NFPROTO_IPV6;
		break;
	default:
		break;
	}

	return family;
}

static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int len;

	len = skb_network_offset(skb) + sizeof(struct iphdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	*frag = ip_is_fragment(ip_hdr(skb));
	return 0;
}

static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int flags = 0, len, payload_ofs = 0;
	unsigned short frag_off;
	int nexthdr;

	len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
	if (unlikely(nexthdr < 0))
		return -EPROTO;

	*frag = flags & IP6_FH_F_FRAG;
	return 0;
}

static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
				   u8 family, u16 zone)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err = 0;
	bool frag;

	/* Previously seen (loopback)? Ignore. */
	ct = nf_ct_get(skb, &ctinfo);
	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
		return 0;

	if (family == NFPROTO_IPV4)
		err = tcf_ct_ipv4_is_fragment(skb, &frag);
	else
		err = tcf_ct_ipv6_is_fragment(skb, &frag);
	if (err || !frag)
		return err;

	skb_get(skb);

	if (family == NFPROTO_IPV4) {
		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;

		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
		local_bh_disable();
		err = ip_defrag(net, skb, user);
		local_bh_enable();
		if (err && err != -EINPROGRESS)
			goto out_free;
	} else { /* NFPROTO_IPV6 */
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;

		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
		err = nf_ct_frag6_gather(net, skb, user);
		if (err && err != -EINPROGRESS)
			goto out_free;
#else
		err = -EOPNOTSUPP;
		goto out_free;
#endif
	}

	skb_clear_hash(skb);
	skb->ignore_df = 1;
	return err;

out_free:
	kfree_skb(skb);
	return err;
}

static void tcf_ct_params_free(struct rcu_head *head)
{
	struct tcf_ct_params *params = container_of(head,
						     struct tcf_ct_params, rcu);

	tcf_ct_flow_table_put(params);

	if (params->tmpl)
		nf_conntrack_put(&params->tmpl->ct_general);
	kfree(params);
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* Modelled after nf_nat_ipv[46]_fn().
 * range is only used for new, uninitialized NAT state.
 * Returns either NF_ACCEPT or NF_DROP.
 */
static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct nf_nat_range2 *range,
			  enum nf_nat_manip_type maniptype)
{
	int hooknum, err = NF_ACCEPT;

	/* See HOOK2MANIP(). */
	if (maniptype == NF_NAT_MANIP_SRC)
		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
	else
		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		if (skb->protocol == htons(ETH_P_IP) &&
		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
							   hooknum))
				err = NF_DROP;
			goto out;
		} else if (IS_ENABLED(CONFIG_IPV6) &&
			   skb->protocol == htons(ETH_P_IPV6)) {
			__be16 frag_off;
			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
			int hdrlen = ipv6_skip_exthdr(skb,
						      sizeof(struct ipv6hdr),
						      &nexthdr, &frag_off);

			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
				if (!nf_nat_icmpv6_reply_translation(skb, ct,
								     ctinfo,
								     hooknum,
								     hdrlen))
					err = NF_DROP;
				goto out;
			}
		}
		/* Non-ICMP, fall thru to initialize if needed. */
		/* fall through */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			/* Initialize according to the NAT action. */
			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
				/* Action is set up to establish a new
				 * mapping.
				 */
				? nf_nat_setup_info(ct, range, maniptype)
				: nf_nat_alloc_null_binding(ct, hooknum);
			if (err != NF_ACCEPT)
				goto out;
		}
		break;

	case IP_CT_ESTABLISHED:
	case IP_CT_ESTABLISHED_REPLY:
		break;

	default:
		err = NF_DROP;
		goto out;
	}

	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
out:
	return err;
}
#endif /* CONFIG_NF_NAT */

static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	u32 new_mark;

	if (!mask)
		return;

	new_mark = mark | (ct->mark & ~(mask));
	if (ct->mark != new_mark) {
		ct->mark = new_mark;
		if (nf_ct_is_confirmed(ct))
			nf_conntrack_event_cache(IPCT_MARK, ct);
	}
#endif
}

static void tcf_ct_act_set_labels(struct nf_conn *ct,
				  u32 *labels,
				  u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
	size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);

	if (!memchr_inv(labels_m, 0, labels_sz))
		return;

	nf_connlabels_replace(ct, labels, labels_m, 4);
#endif
}

static int tcf_ct_act_nat(struct sk_buff *skb,
			  struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  int ct_action,
			  struct nf_nat_range2 *range,
			  bool commit)
{
#if IS_ENABLED(CONFIG_NF_NAT)
	int err;
	enum nf_nat_manip_type maniptype;

	if (!(ct_action & TCA_CT_ACT_NAT))
		return NF_ACCEPT;

	/* Add NAT extension if not confirmed yet. */
	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
		return NF_DROP;   /* Can't NAT. */

	if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
	    (ctinfo != IP_CT_RELATED || commit)) {
		/* NAT an established or related connection like before. */
		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
			/* This is the REPLY direction for a connection
			 * for which NAT was applied in the forward
			 * direction.  Do the reverse NAT.
			 */
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
		else
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
	} else if (ct_action & TCA_CT_ACT_NAT_SRC) {
		maniptype = NF_NAT_MANIP_SRC;
	} else if (ct_action & TCA_CT_ACT_NAT_DST) {
		maniptype = NF_NAT_MANIP_DST;
	} else {
		return NF_ACCEPT;
	}

	err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
	if (err == NF_ACCEPT &&
	    ct->status & IPS_SRC_NAT && ct->status & IPS_DST_NAT) {
		if (maniptype == NF_NAT_MANIP_SRC)
			maniptype = NF_NAT_MANIP_DST;
		else
			maniptype = NF_NAT_MANIP_SRC;

		err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
	}
	return err;
#else
	return NF_ACCEPT;
#endif
}

static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
		      struct tcf_result *res)
{
	struct net *net = dev_net(skb->dev);
	bool cached, commit, clear, force;
	enum ip_conntrack_info ctinfo;
	struct tcf_ct *c = to_ct(a);
	struct nf_conn *tmpl = NULL;
	struct nf_hook_state state;
	int nh_ofs, err, retval;
	struct tcf_ct_params *p;
	bool skip_add = false;
	struct nf_conn *ct;
	u8 family;

	p = rcu_dereference_bh(c->params);

	retval = READ_ONCE(c->tcf_action);
	commit = p->ct_action & TCA_CT_ACT_COMMIT;
	clear = p->ct_action & TCA_CT_ACT_CLEAR;
	force = p->ct_action & TCA_CT_ACT_FORCE;
	tmpl = p->tmpl;

	if (clear) {
		ct = nf_ct_get(skb, &ctinfo);
		if (ct) {
			nf_conntrack_put(&ct->ct_general);
			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
		}

		goto out;
	}

	family = tcf_ct_skb_nf_family(skb);
	if (family == NFPROTO_UNSPEC)
		goto drop;

	/* The conntrack module expects to be working at L3.
	 * We also try to pull the IPv4/6 header to linear area
	 */
	nh_ofs = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_ofs);
	err = tcf_ct_handle_fragments(net, skb, family, p->zone);
	if (err == -EINPROGRESS) {
		retval = TC_ACT_STOLEN;
		goto out;
	}
	if (err)
		goto drop;

	err = tcf_ct_skb_network_trim(skb, family);
	if (err)
		goto drop;

	/* If we are recirculating packets to match on ct fields and
	 * committing with a separate ct action, then we don't need to
	 * actually run the packet through conntrack twice unless it's for a
	 * different zone.
	 */
	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
	if (!cached) {
		if (!commit && tcf_ct_flow_table_lookup(p, skb, family)) {
			skip_add = true;
			goto do_nat;
		}

		/* Associate skb with specified zone. */
		if (tmpl) {
			ct = nf_ct_get(skb, &ctinfo);
			if (skb_nfct(skb))
				nf_conntrack_put(skb_nfct(skb));
			nf_conntrack_get(&tmpl->ct_general);
			nf_ct_set(skb, tmpl, IP_CT_NEW);
		}

		state.hook = NF_INET_PRE_ROUTING;
		state.net = net;
		state.pf = family;
		err = nf_conntrack_in(skb, &state);
		if (err != NF_ACCEPT)
			goto out_push;
	}

do_nat:
	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		goto out_push;
	nf_ct_deliver_cached_events(ct);

	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
	if (err != NF_ACCEPT)
		goto drop;

	if (commit) {
		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);

		/* This will take care of sending queued events
		 * even if the connection is already confirmed.
		 */
		nf_conntrack_confirm(skb);
	} else if (!skip_add) {
		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);
	}

out_push:
	skb_push_rcsum(skb, nh_ofs);

out:
	tcf_action_update_bstats(&c->common, skb);
	return retval;

drop:
	tcf_action_inc_drop_qstats(&c->common);
	return TC_ACT_SHOT;
}

static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
	[TCA_CT_ACTION] = { .type = NLA_U16 },
	[TCA_CT_PARMS] = { .type = NLA_EXACT_LEN,
			   .len = sizeof(struct tc_ct) },
	[TCA_CT_ZONE] = { .type = NLA_U16 },
	[TCA_CT_MARK] = { .type = NLA_U32 },
	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
	[TCA_CT_LABELS] = { .type = NLA_BINARY,
			    .len = 128 / BITS_PER_BYTE },
	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
				 .len = 128 / BITS_PER_BYTE },
	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV6_MIN] = { .type = NLA_EXACT_LEN,
				  .len = sizeof(struct in6_addr) },
	[TCA_CT_NAT_IPV6_MAX] = { .type = NLA_EXACT_LEN,
				  .len = sizeof(struct in6_addr) },
	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
};

static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
				  struct tc_ct *parm,
				  struct nlattr **tb,
				  struct netlink_ext_ack *extack)
{
	struct nf_nat_range2 *range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!IS_ENABLED(CONFIG_NF_NAT)) {
		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
		return -EOPNOTSUPP;
	}

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
		return -EOPNOTSUPP;
	}

	range = &p->range;
	if (tb[TCA_CT_NAT_IPV4_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];

		p->ipv4_range = true;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.ip =
			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);

		range->max_addr.ip = max_attr ?
				     nla_get_in_addr(max_attr) :
				     range->min_addr.ip;
	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];

		p->ipv4_range = false;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.in6 =
			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);

		range->max_addr.in6 = max_attr ?
				      nla_get_in6_addr(max_attr) :
				      range->min_addr.in6;
	}

	if (tb[TCA_CT_NAT_PORT_MIN]) {
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);

		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
				       range->min_proto.all;
	}

	return 0;
}

static void tcf_ct_set_key_val(struct nlattr **tb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	if (!tb[val_type])
		return;
	nla_memcpy(val, tb[val_type], len);

	if (!mask)
		return;

	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
		memset(mask, 0xff, len);
	else
		nla_memcpy(mask, tb[mask_type], len);
}

static int tcf_ct_fill_params(struct net *net,
			      struct tcf_ct_params *p,
			      struct tc_ct *parm,
			      struct nlattr **tb,
			      struct netlink_ext_ack *extack)
{
	struct tc_ct_action_net *tn = net_generic(net, ct_net_id);
	struct nf_conntrack_zone zone;
	struct nf_conn *tmpl;
	int err;

	p->zone = NF_CT_DEFAULT_ZONE_ID;

	tcf_ct_set_key_val(tb,
			   &p->ct_action, TCA_CT_ACTION,
			   NULL, TCA_CT_UNSPEC,
			   sizeof(p->ct_action));

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		return 0;

	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
	if (err)
		return err;

	if (tb[TCA_CT_MARK]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
			return -EOPNOTSUPP;
		}
		tcf_ct_set_key_val(tb,
				   &p->mark, TCA_CT_MARK,
				   &p->mark_mask, TCA_CT_MARK_MASK,
				   sizeof(p->mark));
	}

	if (tb[TCA_CT_LABELS]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
			return -EOPNOTSUPP;
		}

		if (!tn->labels) {
			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
			return -EOPNOTSUPP;
		}
		tcf_ct_set_key_val(tb,
				   p->labels, TCA_CT_LABELS,
				   p->labels_mask, TCA_CT_LABELS_MASK,
				   sizeof(p->labels));
	}

	if (tb[TCA_CT_ZONE]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
			return -EOPNOTSUPP;
		}

		tcf_ct_set_key_val(tb,
				   &p->zone, TCA_CT_ZONE,
				   NULL, TCA_CT_UNSPEC,
				   sizeof(p->zone));
	}

	if (p->zone == NF_CT_DEFAULT_ZONE_ID)
		return 0;

	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
	if (!tmpl) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
		return -ENOMEM;
	}
	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
	nf_conntrack_get(&tmpl->ct_general);
	p->tmpl = tmpl;

	return 0;
}

static int tcf_ct_init(struct net *net, struct nlattr *nla,
		       struct nlattr *est, struct tc_action **a,
		       int replace, int bind, bool rtnl_held,
		       struct tcf_proto *tp, u32 flags,
		       struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, ct_net_id);
	struct tcf_ct_params *params = NULL;
	struct nlattr *tb[TCA_CT_MAX + 1];
	struct tcf_chain *goto_ch = NULL;
	struct tc_ct *parm;
	struct tcf_ct *c;
	int err, res = 0;
	u32 index;

	if (!nla) {
		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
		return -EINVAL;
	}

	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_CT_PARMS]) {
		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
		return -EINVAL;
	}
	parm = nla_data(tb[TCA_CT_PARMS]);
	index = parm->index;
	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;

	if (!err) {
		err = tcf_idr_create_from_flags(tn, index, est, a,
						&act_ct_ops, bind, flags);
		if (err) {
			tcf_idr_cleanup(tn, index);
			return err;
		}
		res = ACT_P_CREATED;
	} else {
		if (bind)
			return 0;

		if (!replace) {
			tcf_idr_release(*a, bind);
			return -EEXIST;
		}
	}
	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto cleanup;

	c = to_ct(*a);

	params = kzalloc(sizeof(*params), GFP_KERNEL);
	if (unlikely(!params)) {
		err = -ENOMEM;
		goto cleanup;
	}

	err = tcf_ct_fill_params(net, params, parm, tb, extack);
	if (err)
		goto cleanup;

	err = tcf_ct_flow_table_get(params);
	if (err)
		goto cleanup;

	spin_lock_bh(&c->tcf_lock);
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	params = rcu_replace_pointer(c->params, params,
				     lockdep_is_held(&c->tcf_lock));
	spin_unlock_bh(&c->tcf_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	if (params)
		kfree_rcu(params, rcu);
	if (res == ACT_P_CREATED)
		tcf_idr_insert(tn, *a);

	return res;

cleanup:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	kfree(params);
	tcf_idr_release(*a, bind);
	return err;
}

static void tcf_ct_cleanup(struct tc_action *a)
{
	struct tcf_ct_params *params;
	struct tcf_ct *c = to_ct(a);

	params = rcu_dereference_protected(c->params, 1);
	if (params)
		call_rcu(&params->rcu, tcf_ct_params_free);
}

static int tcf_ct_dump_key_val(struct sk_buff *skb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	int err;

	if (mask && !memchr_inv(mask, 0, len))
		return 0;

	err = nla_put(skb, val_type, len, val);
	if (err)
		return err;

	if (mask_type != TCA_CT_UNSPEC) {
		err = nla_put(skb, mask_type, len, mask);
		if (err)
			return err;
	}

	return 0;
}

static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
{
	struct nf_nat_range2 *range = &p->range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
		if (p->ipv4_range) {
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
					    range->min_addr.ip))
				return -1;
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
					    range->max_addr.ip))
				return -1;
		} else {
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
					     &range->min_addr.in6))
				return -1;
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
					     &range->max_addr.in6))
				return -1;
		}
	}

	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
				 range->min_proto.all))
			return -1;
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
				 range->max_proto.all))
			return -1;
	}

	return 0;
}

static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
			      int bind, int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_ct *c = to_ct(a);
	struct tcf_ct_params *p;

	struct tc_ct opt = {
		.index   = c->tcf_index,
		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
	};
	struct tcf_t t;

	spin_lock_bh(&c->tcf_lock);
	p = rcu_dereference_protected(c->params,
				      lockdep_is_held(&c->tcf_lock));
	opt.action = c->tcf_action;

	if (tcf_ct_dump_key_val(skb,
				&p->ct_action, TCA_CT_ACTION,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->ct_action)))
		goto nla_put_failure;

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		goto skip_dump;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    tcf_ct_dump_key_val(skb,
				&p->mark, TCA_CT_MARK,
				&p->mark_mask, TCA_CT_MARK_MASK,
				sizeof(p->mark)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    tcf_ct_dump_key_val(skb,
				p->labels, TCA_CT_LABELS,
				p->labels_mask, TCA_CT_LABELS_MASK,
				sizeof(p->labels)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    tcf_ct_dump_key_val(skb,
				&p->zone, TCA_CT_ZONE,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->zone)))
		goto nla_put_failure;

	if (tcf_ct_dump_nat(skb, p))
		goto nla_put_failure;

skip_dump:
	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	tcf_tm_dump(&t, &c->tcf_tm);
	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
		goto nla_put_failure;
	spin_unlock_bh(&c->tcf_lock);

	return skb->len;
nla_put_failure:
	spin_unlock_bh(&c->tcf_lock);
	nlmsg_trim(skb, b);
	return -1;
}

static int tcf_ct_walker(struct net *net, struct sk_buff *skb,
			 struct netlink_callback *cb, int type,
			 const struct tc_action_ops *ops,
			 struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, ct_net_id);

	return tcf_generic_walker(tn, skb, cb, type, ops, extack);
}

static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index)
{
	struct tc_action_net *tn = net_generic(net, ct_net_id);

	return tcf_idr_search(tn, a, index);
}

static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
			     u64 lastuse, bool hw)
{
	struct tcf_ct *c = to_ct(a);

	tcf_action_update_stats(a, bytes, packets, false, hw);
	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}

static struct tc_action_ops act_ct_ops = {
	.kind		= "ct",
	.id		= TCA_ID_CT,
	.owner		= THIS_MODULE,
	.act		= tcf_ct_act,
	.dump		= tcf_ct_dump,
	.init		= tcf_ct_init,
	.cleanup	= tcf_ct_cleanup,
	.walk		= tcf_ct_walker,
	.lookup		= tcf_ct_search,
	.stats_update	= tcf_stats_update,
	.size		= sizeof(struct tcf_ct),
};

static __net_init int ct_init_net(struct net *net)
{
	unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
	struct tc_ct_action_net *tn = net_generic(net, ct_net_id);

	if (nf_connlabels_get(net, n_bits - 1)) {
		tn->labels = false;
		pr_err("act_ct: Failed to set connlabels length");
	} else {
		tn->labels = true;
	}

	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
}

static void __net_exit ct_exit_net(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		struct tc_ct_action_net *tn = net_generic(net, ct_net_id);

		if (tn->labels)
			nf_connlabels_put(net);
	}
	rtnl_unlock();

	tc_action_net_exit(net_list, ct_net_id);
}

static struct pernet_operations ct_net_ops = {
	.init		= ct_init_net,
	.exit_batch	= ct_exit_net,
	.id		= &ct_net_id,
	.size		= sizeof(struct tc_ct_action_net),
};

static int __init ct_init_module(void)
{
	int err;

	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
	if (!act_ct_wq)
		return -ENOMEM;

	err = tcf_ct_flow_tables_init();
	if (err)
		goto err_tbl_init;

	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
	if (err)
		goto err_register;

	return 0;

	/* Unwind in reverse order of setup: a failed action registration
	 * must release both the zones table and the workqueue, while a
	 * failed table init only has the workqueue to release.
	 */
err_register:
	tcf_ct_flow_tables_uninit();
err_tbl_init:
	destroy_workqueue(act_ct_wq);
	return err;
}

static void __exit ct_cleanup_module(void)
{
	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
	tcf_ct_flow_tables_uninit();
	destroy_workqueue(act_ct_wq);
}

module_init(ct_init_module);
module_exit(ct_cleanup_module);
MODULE_AUTHOR("Paul Blakey <paulb@mellanox.com>");
MODULE_AUTHOR("Yossi Kuperman <yossiku@mellanox.com>");
MODULE_AUTHOR("Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>");
MODULE_DESCRIPTION("Connection tracking action");
MODULE_LICENSE("GPL v2");