/*
 * Copyright (c) 2015 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */

#include <linux/module.h>
#include <linux/openvswitch.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <net/ip.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>

#ifdef CONFIG_NF_NAT_NEEDED
#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
#endif

#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"

/* Pair of length bounds (maximum and minimum).
 * NOTE(review): the table that uses this type is not visible in this part of
 * the file; presumably it validates netlink attribute lengths -- confirm.
 */
struct ovs_ct_len_tbl {
	int maxlen;
	int minlen;
};

/* Metadata mark for masked write to conntrack mark: only the bits set in
 * 'mask' are meant to be written from 'value'.
 */
struct md_mark {
	u32 value;
	u32 mask;
};

/* Metadata label for masked write to conntrack label: only the bits set in
 * 'mask' are meant to be written from 'value'.
 */
struct md_labels {
	struct ovs_key_ct_labels value;
	struct ovs_key_ct_labels mask;
};

/* NAT mode bits; stored in the 3-bit 'nat' field of
 * struct ovs_conntrack_info.
 */
enum ovs_ct_nat {
	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
};

/* Conntrack action context for execution.
*/ 62 struct ovs_conntrack_info { 63 struct nf_conntrack_helper *helper; 64 struct nf_conntrack_zone zone; 65 struct nf_conn *ct; 66 u8 commit : 1; 67 u8 nat : 3; /* enum ovs_ct_nat */ 68 u8 force : 1; 69 u16 family; 70 struct md_mark mark; 71 struct md_labels labels; 72 #ifdef CONFIG_NF_NAT_NEEDED 73 struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */ 74 #endif 75 }; 76 77 static bool labels_nonzero(const struct ovs_key_ct_labels *labels); 78 79 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); 80 81 static u16 key_to_nfproto(const struct sw_flow_key *key) 82 { 83 switch (ntohs(key->eth.type)) { 84 case ETH_P_IP: 85 return NFPROTO_IPV4; 86 case ETH_P_IPV6: 87 return NFPROTO_IPV6; 88 default: 89 return NFPROTO_UNSPEC; 90 } 91 } 92 93 /* Map SKB connection state into the values used by flow definition. */ 94 static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) 95 { 96 u8 ct_state = OVS_CS_F_TRACKED; 97 98 switch (ctinfo) { 99 case IP_CT_ESTABLISHED_REPLY: 100 case IP_CT_RELATED_REPLY: 101 ct_state |= OVS_CS_F_REPLY_DIR; 102 break; 103 default: 104 break; 105 } 106 107 switch (ctinfo) { 108 case IP_CT_ESTABLISHED: 109 case IP_CT_ESTABLISHED_REPLY: 110 ct_state |= OVS_CS_F_ESTABLISHED; 111 break; 112 case IP_CT_RELATED: 113 case IP_CT_RELATED_REPLY: 114 ct_state |= OVS_CS_F_RELATED; 115 break; 116 case IP_CT_NEW: 117 ct_state |= OVS_CS_F_NEW; 118 break; 119 default: 120 break; 121 } 122 123 return ct_state; 124 } 125 126 static u32 ovs_ct_get_mark(const struct nf_conn *ct) 127 { 128 #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) 129 return ct ? ct->mark : 0; 130 #else 131 return 0; 132 #endif 133 } 134 135 /* Guard against conntrack labels max size shrinking below 128 bits. */ 136 #if NF_CT_LABELS_MAX_SIZE < 16 137 #error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes 138 #endif 139 140 static void ovs_ct_get_labels(const struct nf_conn *ct, 141 struct ovs_key_ct_labels *labels) 142 { 143 struct nf_conn_labels *cl = ct ? 
nf_ct_labels_find(ct) : NULL; 144 145 if (cl) 146 memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); 147 else 148 memset(labels, 0, OVS_CT_LABELS_LEN); 149 } 150 151 static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, 152 const struct nf_conntrack_tuple *orig, 153 u8 icmp_proto) 154 { 155 key->ct_orig_proto = orig->dst.protonum; 156 if (orig->dst.protonum == icmp_proto) { 157 key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); 158 key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); 159 } else { 160 key->ct.orig_tp.src = orig->src.u.all; 161 key->ct.orig_tp.dst = orig->dst.u.all; 162 } 163 } 164 165 static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, 166 const struct nf_conntrack_zone *zone, 167 const struct nf_conn *ct) 168 { 169 key->ct_state = state; 170 key->ct_zone = zone->id; 171 key->ct.mark = ovs_ct_get_mark(ct); 172 ovs_ct_get_labels(ct, &key->ct.labels); 173 174 if (ct) { 175 const struct nf_conntrack_tuple *orig; 176 177 /* Use the master if we have one. */ 178 if (ct->master) 179 ct = ct->master; 180 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 181 182 /* IP version must match with the master connection. */ 183 if (key->eth.type == htons(ETH_P_IP) && 184 nf_ct_l3num(ct) == NFPROTO_IPV4) { 185 key->ipv4.ct_orig.src = orig->src.u3.ip; 186 key->ipv4.ct_orig.dst = orig->dst.u3.ip; 187 __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); 188 return; 189 } else if (key->eth.type == htons(ETH_P_IPV6) && 190 !sw_flow_key_is_nd(key) && 191 nf_ct_l3num(ct) == NFPROTO_IPV6) { 192 key->ipv6.ct_orig.src = orig->src.u3.in6; 193 key->ipv6.ct_orig.dst = orig->dst.u3.in6; 194 __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); 195 return; 196 } 197 } 198 /* Clear 'ct_orig_proto' to mark the non-existence of conntrack 199 * original direction key fields. 200 */ 201 key->ct_orig_proto = 0; 202 } 203 204 /* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has 205 * previously sent the packet to conntrack via the ct action. 
If 206 * 'keep_nat_flags' is true, the existing NAT flags retained, else they are 207 * initialized from the connection status. 208 */ 209 static void ovs_ct_update_key(const struct sk_buff *skb, 210 const struct ovs_conntrack_info *info, 211 struct sw_flow_key *key, bool post_ct, 212 bool keep_nat_flags) 213 { 214 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 215 enum ip_conntrack_info ctinfo; 216 struct nf_conn *ct; 217 u8 state = 0; 218 219 ct = nf_ct_get(skb, &ctinfo); 220 if (ct) { 221 state = ovs_ct_get_state(ctinfo); 222 /* All unconfirmed entries are NEW connections. */ 223 if (!nf_ct_is_confirmed(ct)) 224 state |= OVS_CS_F_NEW; 225 /* OVS persists the related flag for the duration of the 226 * connection. 227 */ 228 if (ct->master) 229 state |= OVS_CS_F_RELATED; 230 if (keep_nat_flags) { 231 state |= key->ct_state & OVS_CS_F_NAT_MASK; 232 } else { 233 if (ct->status & IPS_SRC_NAT) 234 state |= OVS_CS_F_SRC_NAT; 235 if (ct->status & IPS_DST_NAT) 236 state |= OVS_CS_F_DST_NAT; 237 } 238 zone = nf_ct_zone(ct); 239 } else if (post_ct) { 240 state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; 241 if (info) 242 zone = &info->zone; 243 } 244 __ovs_ct_update_key(key, state, zone, ct); 245 } 246 247 /* This is called to initialize CT key fields possibly coming in from the local 248 * stack. 
 */
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
	ovs_ct_update_key(skb, NULL, key, false, false);
}

/* Expands to a brace initializer listing the four 32-bit words of the IPv6
 * address 'ADDR', in order.
 */
#define IN6_ADDR_INITIALIZER(ADDR) \
	{ (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
	  (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }

/* Append the conntrack key attributes to the netlink message in 'skb'.
 *
 * The attribute values are read from 'output'.  'swkey' decides whether an
 * original-direction tuple attribute is emitted (its 'ct_orig_proto' must be
 * nonzero) and which one (by its Ethernet type).  The zone, mark and labels
 * attributes are only emitted when the matching conntrack feature is enabled
 * in the kernel configuration.
 *
 * Returns 0 on success, or -EMSGSIZE if any attribute could not be added.
 */
int ovs_ct_put_key(const struct sw_flow_key *swkey,
		   const struct sw_flow_key *output, struct sk_buff *skb)
{
	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark))
		return -EMSGSIZE;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels),
		    &output->ct.labels))
		return -EMSGSIZE;

	if (swkey->ct_orig_proto) {
		if (swkey->eth.type == htons(ETH_P_IP)) {
			struct ovs_key_ct_tuple_ipv4 orig = {
				output->ipv4.ct_orig.src,
				output->ipv4.ct_orig.dst,
				output->ct.orig_tp.src,
				output->ct.orig_tp.dst,
				output->ct_orig_proto,
			};
			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
				    sizeof(orig), &orig))
				return -EMSGSIZE;
		} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
			struct ovs_key_ct_tuple_ipv6 orig = {
				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
				IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
				output->ct.orig_tp.src,
				output->ct.orig_tp.dst,
				output->ct_orig_proto,
			};
			if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
				    sizeof(orig), &orig))
				return -EMSGSIZE;
		}
	}

	return 0;
}

/* Masked write of 'ct_mark' into the mark of 'ct'.
 *
 * The new mark is 'ct_mark' OR'ed with the bits of the current mark that lie
 * outside 'mask'; 'ct_mark' itself is not masked here.  When the mark
 * changes, 'key' is updated too, and an IPCT_MARK event is cached if the
 * connection is already confirmed.
 *
 * Returns 0, or -ENOTSUPP when conntrack marks are not compiled in.
 */
static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
			   u32 ct_mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	u32 new_mark;

	new_mark = ct_mark | (ct->mark & ~(mask));
	if (ct->mark != new_mark) {
		ct->mark = new_mark;
		if (nf_ct_is_confirmed(ct))
			nf_conntrack_event_cache(IPCT_MARK, ct);
		key->ct.mark = new_mark;
	}

	return 0;
#else
	return -ENOTSUPP;
#endif
}

/* Return the labels extension of 'ct', trying to add it first when it is not
 * present yet.  The result may still be NULL; callers check for that.
 */
static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct)
{
	struct nf_conn_labels *cl;

	cl = nf_ct_labels_find(ct);
	if (!cl) {
		nf_ct_labels_ext_add(ct);
		cl = nf_ct_labels_find(ct);
	}

	return cl;
}

/* Initialize labels for a new, yet to be committed conntrack entry.  Note that
 * since the new connection is not yet confirmed, and thus no-one else has
 * access to its labels, we simply write them over.
 *
 * The labels of the master connection, if any, are copied first; the bits
 * selected by 'mask' are then overwritten from 'labels'.  On success the
 * resulting labels are mirrored into 'key'.
 *
 * Returns 0 on success (including when there is nothing to do), or -ENOSPC
 * if no labels extension is available.
 */
static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key,
			      const struct ovs_key_ct_labels *labels,
			      const struct ovs_key_ct_labels *mask)
{
	struct nf_conn_labels *cl, *master_cl;
	bool have_mask = labels_nonzero(mask);

	/* Inherit master's labels to the related connection? */
	master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL;

	if (!master_cl && !have_mask)
		return 0;   /* Nothing to do. */

	cl = ovs_ct_get_conn_labels(ct);
	if (!cl)
		return -ENOSPC;

	/* Inherit the master's labels, if any. */
	if (master_cl)
		*cl = *master_cl;

	if (have_mask) {
		u32 *dst = (u32 *)cl->bits;
		int i;

		/* Per 32-bit word: keep the bits outside the mask, take the
		 * bits inside the mask from 'labels'.
		 */
		for (i = 0; i < OVS_CT_LABELS_LEN_32; i++)
			dst[i] = (dst[i] & ~mask->ct_labels_32[i]) |
				(labels->ct_labels_32[i]
				 & mask->ct_labels_32[i]);
	}

	/* Labels are included in the IPCTNL_MSG_CT_NEW event only if the
	 * IPCT_LABEL bit is set in the event cache.
	 */
	nf_conntrack_event_cache(IPCT_LABEL, ct);

	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);

	return 0;
}

/* Masked update of the labels of 'ct' through nf_connlabels_replace().  On
 * success the resulting labels are mirrored into 'key'.
 *
 * Returns 0 on success, -ENOSPC if no labels extension is available, or the
 * error returned by nf_connlabels_replace().
 */
static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
			     const struct ovs_key_ct_labels *labels,
			     const struct ovs_key_ct_labels *mask)
{
	struct nf_conn_labels *cl;
	int err;

	cl = ovs_ct_get_conn_labels(ct);
	if (!cl)
		return -ENOSPC;

	err = nf_connlabels_replace(ct, labels->ct_labels_32,
				    mask->ct_labels_32,
				    OVS_CT_LABELS_LEN_32);
	if (err)
		return err;

	memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN);

	return 0;
}

/* Run the conntrack helper attached to the connection of 'skb', if any.
 * 'skb' should already be pulled to nh_ofs.
 *
 * 'proto' is the netfilter family (NFPROTO_IPV4 or NFPROTO_IPV6) and is used
 * to locate the transport header.
 *
 * Returns NF_ACCEPT when there is no connection or no helper, when 'ctinfo'
 * is IP_CT_RELATED_REPLY, or when the IPv6 transport header cannot be found.
 * Returns NF_DROP for any other family or when sequence adjustment fails.
 * Otherwise returns the helper's own verdict.
 */
static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	enum ip_conntrack_info ctinfo;
	unsigned int protoff;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
		return NF_ACCEPT;

	help = nfct_help(ct);
	if (!help)
		return NF_ACCEPT;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return NF_ACCEPT;

	switch (proto) {
	case NFPROTO_IPV4:
		protoff = ip_hdrlen(skb);
		break;
	case NFPROTO_IPV6: {
		u8 nexthdr = ipv6_hdr(skb)->nexthdr;
		__be16 frag_off;
		int ofs;

		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
				       &frag_off);
		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
			pr_debug("proto header not found\n");
			return NF_ACCEPT;
		}
		protoff = ofs;
		break;
	}
	default:
		WARN_ONCE(1, "helper invoked on non-IP family!");
		return NF_DROP;
	}

	err = helper->help(skb, protoff, ct, ctinfo);
	if (err != NF_ACCEPT)
		return err;

	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
	 * FTP with NAT) adjusting the TCP payload size when mangling IP
	 * addresses and/or port numbers in the text-based control connection.
	 */
	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
		return NF_DROP;
	return NF_ACCEPT;
}

/* Reassemble an IP fragment in conntrack zone 'zone'.
 *
 * Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
 * value if 'skb' is freed.
 *
 * On success 'key' is updated to describe an unfragmented packet, the skb
 * hash is cleared, 'ignore_df' is set, and the OVS control block saved on
 * entry is written back with 'mru' set to the largest fragment size seen.
 */
static int handle_fragments(struct net *net, struct sw_flow_key *key,
			    u16 zone, struct sk_buff *skb)
{
	/* Saved copy: the IPCB()/IP6CB() memset below overwrites skb->cb. */
	struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
	int err;

	if (key->eth.type == htons(ETH_P_IP)) {
		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;

		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
		err = ip_defrag(net, skb, user);
		if (err)
			return err;

		ovs_cb.mru = IPCB(skb)->frag_max_size;
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
	} else if (key->eth.type == htons(ETH_P_IPV6)) {
		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;

		skb_orphan(skb);
		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
		err = nf_ct_frag6_gather(net, skb, user);
		if (err) {
			if (err != -EINPROGRESS)
				kfree_skb(skb);
			return err;
		}

		key->ip.proto = ipv6_hdr(skb)->nexthdr;
		ovs_cb.mru = IP6CB(skb)->frag_max_size;
#endif
	} else {
		kfree_skb(skb);
		return -EPFNOSUPPORT;
	}

	key->ip.frag = OVS_FRAG_TYPE_NONE;
	skb_clear_hash(skb);
	skb->ignore_df = 1;
	*OVS_CB(skb) = ovs_cb;

	return 0;
}

/* Look up a conntrack expectation matching 'skb' in 'zone'.  'proto' is the
 * netfilter L3 family used to extract the tuple.  Returns NULL when the tuple
 * cannot be extracted or no expectation matches.
 */
static struct nf_conntrack_expect *
ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
		   u16 proto, const struct sk_buff *skb)
{
	struct nf_conntrack_tuple tuple;

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
		return NULL;
	return __nf_ct_expect_find(net, zone, &tuple);
}

/* This replicates logic
from nf_conntrack_core.c that is not exported. */ 525 static enum ip_conntrack_info 526 ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) 527 { 528 const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 529 530 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) 531 return IP_CT_ESTABLISHED_REPLY; 532 /* Once we've had two way comms, always ESTABLISHED. */ 533 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 534 return IP_CT_ESTABLISHED; 535 if (test_bit(IPS_EXPECTED_BIT, &ct->status)) 536 return IP_CT_RELATED; 537 return IP_CT_NEW; 538 } 539 540 /* Find an existing connection which this packet belongs to without 541 * re-attributing statistics or modifying the connection state. This allows an 542 * skb->_nfct lost due to an upcall to be recovered during actions execution. 543 * 544 * Must be called with rcu_read_lock. 545 * 546 * On success, populates skb->_nfct and returns the connection. Returns NULL 547 * if there is no existing entry. 548 */ 549 static struct nf_conn * 550 ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, 551 u8 l3num, struct sk_buff *skb, bool natted) 552 { 553 struct nf_conntrack_l3proto *l3proto; 554 struct nf_conntrack_l4proto *l4proto; 555 struct nf_conntrack_tuple tuple; 556 struct nf_conntrack_tuple_hash *h; 557 struct nf_conn *ct; 558 unsigned int dataoff; 559 u8 protonum; 560 561 l3proto = __nf_ct_l3proto_find(l3num); 562 if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, 563 &protonum) <= 0) { 564 pr_debug("ovs_ct_find_existing: Can't get protonum\n"); 565 return NULL; 566 } 567 l4proto = __nf_ct_l4proto_find(l3num, protonum); 568 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 569 protonum, net, &tuple, l3proto, l4proto)) { 570 pr_debug("ovs_ct_find_existing: Can't get tuple\n"); 571 return NULL; 572 } 573 574 /* Must invert the tuple if skb has been transformed by NAT. 
*/ 575 if (natted) { 576 struct nf_conntrack_tuple inverse; 577 578 if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) { 579 pr_debug("ovs_ct_find_existing: Inversion failed!\n"); 580 return NULL; 581 } 582 tuple = inverse; 583 } 584 585 /* look for tuple match */ 586 h = nf_conntrack_find_get(net, zone, &tuple); 587 if (!h) 588 return NULL; /* Not found. */ 589 590 ct = nf_ct_tuplehash_to_ctrack(h); 591 592 /* Inverted packet tuple matches the reverse direction conntrack tuple, 593 * select the other tuplehash to get the right 'ctinfo' bits for this 594 * packet. 595 */ 596 if (natted) 597 h = &ct->tuplehash[!h->tuple.dst.dir]; 598 599 nf_ct_set(skb, ct, ovs_ct_get_info(h)); 600 return ct; 601 } 602 603 /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ 604 static bool skb_nfct_cached(struct net *net, 605 const struct sw_flow_key *key, 606 const struct ovs_conntrack_info *info, 607 struct sk_buff *skb) 608 { 609 enum ip_conntrack_info ctinfo; 610 struct nf_conn *ct; 611 612 ct = nf_ct_get(skb, &ctinfo); 613 /* If no ct, check if we have evidence that an existing conntrack entry 614 * might be found for this skb. This happens when we lose a skb->_nfct 615 * due to an upcall. If the connection was not confirmed, it is not 616 * cached and needs to be run through conntrack again. 
617 */ 618 if (!ct && key->ct_state & OVS_CS_F_TRACKED && 619 !(key->ct_state & OVS_CS_F_INVALID) && 620 key->ct_zone == info->zone.id) { 621 ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, 622 !!(key->ct_state 623 & OVS_CS_F_NAT_MASK)); 624 if (ct) 625 nf_ct_get(skb, &ctinfo); 626 } 627 if (!ct) 628 return false; 629 if (!net_eq(net, read_pnet(&ct->ct_net))) 630 return false; 631 if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) 632 return false; 633 if (info->helper) { 634 struct nf_conn_help *help; 635 636 help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); 637 if (help && rcu_access_pointer(help->helper) != info->helper) 638 return false; 639 } 640 /* Force conntrack entry direction to the current packet? */ 641 if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { 642 /* Delete the conntrack entry if confirmed, else just release 643 * the reference. 644 */ 645 if (nf_ct_is_confirmed(ct)) 646 nf_ct_delete(ct, 0, 0); 647 else 648 nf_conntrack_put(&ct->ct_general); 649 nf_ct_set(skb, NULL, 0); 650 return false; 651 } 652 653 return true; 654 } 655 656 #ifdef CONFIG_NF_NAT_NEEDED 657 /* Modelled after nf_nat_ipv[46]_fn(). 658 * range is only used for new, uninitialized NAT state. 659 * Returns either NF_ACCEPT or NF_DROP. 660 */ 661 static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, 662 enum ip_conntrack_info ctinfo, 663 const struct nf_nat_range *range, 664 enum nf_nat_manip_type maniptype) 665 { 666 int hooknum, nh_off, err = NF_ACCEPT; 667 668 nh_off = skb_network_offset(skb); 669 skb_pull_rcsum(skb, nh_off); 670 671 /* See HOOK2MANIP(). 
*/ 672 if (maniptype == NF_NAT_MANIP_SRC) 673 hooknum = NF_INET_LOCAL_IN; /* Source NAT */ 674 else 675 hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ 676 677 switch (ctinfo) { 678 case IP_CT_RELATED: 679 case IP_CT_RELATED_REPLY: 680 if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && 681 skb->protocol == htons(ETH_P_IP) && 682 ip_hdr(skb)->protocol == IPPROTO_ICMP) { 683 if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, 684 hooknum)) 685 err = NF_DROP; 686 goto push; 687 } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && 688 skb->protocol == htons(ETH_P_IPV6)) { 689 __be16 frag_off; 690 u8 nexthdr = ipv6_hdr(skb)->nexthdr; 691 int hdrlen = ipv6_skip_exthdr(skb, 692 sizeof(struct ipv6hdr), 693 &nexthdr, &frag_off); 694 695 if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { 696 if (!nf_nat_icmpv6_reply_translation(skb, ct, 697 ctinfo, 698 hooknum, 699 hdrlen)) 700 err = NF_DROP; 701 goto push; 702 } 703 } 704 /* Non-ICMP, fall thru to initialize if needed. */ 705 case IP_CT_NEW: 706 /* Seen it before? This can happen for loopback, retrans, 707 * or local packets. 708 */ 709 if (!nf_nat_initialized(ct, maniptype)) { 710 /* Initialize according to the NAT action. */ 711 err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) 712 /* Action is set up to establish a new 713 * mapping. 714 */ 715 ? 
nf_nat_setup_info(ct, range, maniptype) 716 : nf_nat_alloc_null_binding(ct, hooknum); 717 if (err != NF_ACCEPT) 718 goto push; 719 } 720 break; 721 722 case IP_CT_ESTABLISHED: 723 case IP_CT_ESTABLISHED_REPLY: 724 break; 725 726 default: 727 err = NF_DROP; 728 goto push; 729 } 730 731 err = nf_nat_packet(ct, ctinfo, hooknum, skb); 732 push: 733 skb_push(skb, nh_off); 734 skb_postpush_rcsum(skb, skb->data, nh_off); 735 736 return err; 737 } 738 739 static void ovs_nat_update_key(struct sw_flow_key *key, 740 const struct sk_buff *skb, 741 enum nf_nat_manip_type maniptype) 742 { 743 if (maniptype == NF_NAT_MANIP_SRC) { 744 __be16 src; 745 746 key->ct_state |= OVS_CS_F_SRC_NAT; 747 if (key->eth.type == htons(ETH_P_IP)) 748 key->ipv4.addr.src = ip_hdr(skb)->saddr; 749 else if (key->eth.type == htons(ETH_P_IPV6)) 750 memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, 751 sizeof(key->ipv6.addr.src)); 752 else 753 return; 754 755 if (key->ip.proto == IPPROTO_UDP) 756 src = udp_hdr(skb)->source; 757 else if (key->ip.proto == IPPROTO_TCP) 758 src = tcp_hdr(skb)->source; 759 else if (key->ip.proto == IPPROTO_SCTP) 760 src = sctp_hdr(skb)->source; 761 else 762 return; 763 764 key->tp.src = src; 765 } else { 766 __be16 dst; 767 768 key->ct_state |= OVS_CS_F_DST_NAT; 769 if (key->eth.type == htons(ETH_P_IP)) 770 key->ipv4.addr.dst = ip_hdr(skb)->daddr; 771 else if (key->eth.type == htons(ETH_P_IPV6)) 772 memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, 773 sizeof(key->ipv6.addr.dst)); 774 else 775 return; 776 777 if (key->ip.proto == IPPROTO_UDP) 778 dst = udp_hdr(skb)->dest; 779 else if (key->ip.proto == IPPROTO_TCP) 780 dst = tcp_hdr(skb)->dest; 781 else if (key->ip.proto == IPPROTO_SCTP) 782 dst = sctp_hdr(skb)->dest; 783 else 784 return; 785 786 key->tp.dst = dst; 787 } 788 } 789 790 /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. 
*/
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
		      const struct ovs_conntrack_info *info,
		      struct sk_buff *skb, struct nf_conn *ct,
		      enum ip_conntrack_info ctinfo)
{
	enum nf_nat_manip_type maniptype;
	int err;

	if (nf_ct_is_untracked(ct)) {
		/* A NAT action may only be performed on tracked packets. */
		return NF_ACCEPT;
	}

	/* Add NAT extension if not confirmed yet. */
	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
		return NF_ACCEPT;   /* Can't NAT. */

	/* Determine NAT type.
	 * Check if the NAT type can be deduced from the tracked connection.
	 * Make sure new expected connections (IP_CT_RELATED) are NATted only
	 * when committing.
	 */
	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
	    ct->status & IPS_NAT_MASK &&
	    (ctinfo != IP_CT_RELATED || info->commit)) {
		/* NAT an established or related connection like before. */
		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
			/* This is the REPLY direction for a connection
			 * for which NAT was applied in the forward
			 * direction.  Do the reverse NAT.
			 */
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
		else
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
	} else if (info->nat & OVS_CT_SRC_NAT) {
		maniptype = NF_NAT_MANIP_SRC;
	} else if (info->nat & OVS_CT_DST_NAT) {
		maniptype = NF_NAT_MANIP_DST;
	} else {
		return NF_ACCEPT; /* Connection is not NATed. */
	}
	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);

	/* Mark NAT done if successful and update the flow key. */
	if (err == NF_ACCEPT)
		ovs_nat_update_key(key, skb, maniptype);

	return err;
}
#else /* !CONFIG_NF_NAT_NEEDED */
/* Without NAT support compiled in, NAT is a no-op that accepts the packet. */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
		      const struct ovs_conntrack_info *info,
		      struct sk_buff *skb, struct nf_conn *ct,
		      enum ip_conntrack_info ctinfo)
{
	return NF_ACCEPT;
}
#endif

/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
 * not done already.  Update key with new CT state after passing the packet
 * through conntrack.
 * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be
 * set to NULL and 0 will be returned.
 *
 * Returns 0 on success, -ENOENT if nf_conntrack_in() does not accept the
 * packet, -EINVAL if NAT or the helper does not accept it, or the error from
 * __nf_ct_try_assign_helper().
 */
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
			   const struct ovs_conntrack_info *info,
			   struct sk_buff *skb)
{
	/* If we are recirculating packets to match on conntrack fields and
	 * committing with a separate conntrack action,  then we don't need to
	 * actually run the packet through conntrack twice unless it's for a
	 * different zone.
	 */
	bool cached = skb_nfct_cached(net, key, info, skb);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	if (!cached) {
		struct nf_conn *tmpl = info->ct;
		int err;

		/* Associate skb with specified zone. */
		if (tmpl) {
			/* Drop the reference to whatever 'skb' pointed at
			 * before and take one on the template instead.
			 */
			if (skb_nfct(skb))
				nf_conntrack_put(skb_nfct(skb));
			nf_conntrack_get(&tmpl->ct_general);
			nf_ct_set(skb, tmpl, IP_CT_NEW);
		}

		err = nf_conntrack_in(net, info->family,
				      NF_INET_PRE_ROUTING, skb);
		if (err != NF_ACCEPT)
			return -ENOENT;

		/* Clear CT state NAT flags to mark that we have not yet done
		 * NAT after the nf_conntrack_in() call.  We can actually clear
		 * the whole state, as it will be re-initialized below.
		 */
		key->ct_state = 0;

		/* Update the key, but keep the NAT flags. */
		ovs_ct_update_key(skb, info, key, true, true);
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		/* Packets starting a new connection must be NATted before the
		 * helper, so that the helper knows about the NAT.  We enforce
		 * this by delaying both NAT and helper calls for unconfirmed
		 * connections until the committing CT action.  For later
		 * packets NAT and Helper may be called in either order.
		 *
		 * NAT will be done only if the CT action has NAT, and only
		 * once per packet (per zone), as guarded by the NAT bits in
		 * the key->ct_state.
		 */
		if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
		    (nf_ct_is_confirmed(ct) || info->commit) &&
		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
			return -EINVAL;
		}

		/* Userspace may decide to perform a ct lookup without a helper
		 * specified followed by a (recirculate and) commit with one.
		 * Therefore, for unconfirmed connections which we will commit,
		 * we need to attach the helper here.
		 */
		if (!nf_ct_is_confirmed(ct) && info->commit &&
		    info->helper && !nfct_help(ct)) {
			int err = __nf_ct_try_assign_helper(ct, info->ct,
							    GFP_ATOMIC);
			if (err)
				return err;
		}

		/* Call the helper only if:
		 * - nf_conntrack_in() was executed above ("!cached") for a
		 *   confirmed connection, or
		 * - When committing an unconfirmed connection.
		 */
		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
			return -EINVAL;
		}
	}

	return 0;
}

/* Lookup connection and read fields into key.
*/ 944 static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, 945 const struct ovs_conntrack_info *info, 946 struct sk_buff *skb) 947 { 948 struct nf_conntrack_expect *exp; 949 950 /* If we pass an expected packet through nf_conntrack_in() the 951 * expectation is typically removed, but the packet could still be 952 * lost in upcall processing. To prevent this from happening we 953 * perform an explicit expectation lookup. Expected connections are 954 * always new, and will be passed through conntrack only when they are 955 * committed, as it is OK to remove the expectation at that time. 956 */ 957 exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); 958 if (exp) { 959 u8 state; 960 961 /* NOTE: New connections are NATted and Helped only when 962 * committed, so we are not calling into NAT here. 963 */ 964 state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; 965 __ovs_ct_update_key(key, state, &info->zone, exp->master); 966 } else { 967 struct nf_conn *ct; 968 int err; 969 970 err = __ovs_ct_lookup(net, key, info, skb); 971 if (err) 972 return err; 973 974 ct = (struct nf_conn *)skb_nfct(skb); 975 if (ct) 976 nf_ct_deliver_cached_events(ct); 977 } 978 979 return 0; 980 } 981 982 static bool labels_nonzero(const struct ovs_key_ct_labels *labels) 983 { 984 size_t i; 985 986 for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) 987 if (labels->ct_labels_32[i]) 988 return true; 989 990 return false; 991 } 992 993 /* Lookup connection and confirm if unconfirmed. 
*/
static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
			 const struct ovs_conntrack_info *info,
			 struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	err = __ovs_ct_lookup(net, key, info, skb);
	if (err)
		return err;

	/* The connection could be invalid, in which case this is a no-op. */
	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

	/* Apply changes before confirming the connection so that the initial
	 * conntrack NEW netlink event carries the values given in the CT
	 * action.
	 */
	if (info->mark.mask) {
		err = ovs_ct_set_mark(ct, key, info->mark.value,
				      info->mark.mask);
		if (err)
			return err;
	}
	/* Unconfirmed entries get their labels written directly; confirmed
	 * ones are only touched when the action carries a nonzero label mask.
	 */
	if (!nf_ct_is_confirmed(ct)) {
		err = ovs_ct_init_labels(ct, key, &info->labels.value,
					 &info->labels.mask);
		if (err)
			return err;
	} else if (labels_nonzero(&info->labels.mask)) {
		err = ovs_ct_set_labels(ct, key, &info->labels.value,
					&info->labels.mask);
		if (err)
			return err;
	}
	/* This will take care of sending queued events even if the connection
	 * is already confirmed.
	 */
	if (nf_conntrack_confirm(skb) != NF_ACCEPT)
		return -EINVAL;

	return 0;
}

/* Execute a conntrack action on 'skb': reassemble fragments if needed, then
 * either commit or merely look up the connection, depending on
 * 'info->commit'.
 *
 * Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
 * value if 'skb' is freed.
 */
int ovs_ct_execute(struct net *net, struct sk_buff *skb,
		   struct sw_flow_key *key,
		   const struct ovs_conntrack_info *info)
{
	int nh_ofs;
	int err;

	/* The conntrack module expects to be working at L3. */
	nh_ofs = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_ofs);

	if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
		/* On failure handle_fragments() has already freed or stolen
		 * 'skb', so return without touching it again.
		 */
		err = handle_fragments(net, key, info->zone.id, skb);
		if (err)
			return err;
	}

	if (info->commit)
		err = ovs_ct_commit(net, key, info, skb);
	else
		err = ovs_ct_lookup(net, key, info, skb);

	/* Restore the headers pulled above before handing 'skb' back. */
	skb_push(skb, nh_ofs);
	skb_postpush_rcsum(skb, skb->data, nh_ofs);
	if (err)
		kfree_skb(skb);
	return err;
}

/* Attach the conntrack helper called 'name' to the template connection
 * 'info->ct'.  The helper is looked up for 'info->family' and the IP protocol
 * found in 'key'; the lookup takes a module reference, which is dropped again
 * if the helper extension cannot be added.
 *
 * Returns 0 on success, -EINVAL if the helper is unknown, or -ENOMEM if the
 * helper extension cannot be allocated.
 */
static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
			     const struct sw_flow_key *key, bool log)
{
	struct nf_conntrack_helper *helper;
	struct nf_conn_help *help;

	helper = nf_conntrack_helper_try_module_get(name, info->family,
						    key->ip.proto);
	if (!helper) {
		OVS_NLERR(log, "Unknown helper \"%s\"", name);
		return -EINVAL;
	}

	help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
	if (!help) {
		module_put(helper->me);
		return -ENOMEM;
	}

	rcu_assign_pointer(help->helper, helper);
	info->helper = helper;
	return 0;
}

#ifdef CONFIG_NF_NAT_NEEDED
static int parse_nat(const struct nlattr *attr,
		     struct ovs_conntrack_info *info, bool log)
{
	struct nlattr *a;
	int rem;
	bool have_ip_max = false;
	bool have_proto_max = false;
	bool ip_vers = (info->family == NFPROTO_IPV6);

	nla_for_each_nested(a, attr, rem) {
		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
			[OVS_NAT_ATTR_SRC] = {0, 0},
			[OVS_NAT_ATTR_DST] = {0, 0},
			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
						 sizeof(struct in6_addr)},
			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
						 sizeof(struct in6_addr)},
			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
			[OVS_NAT_ATTR_PROTO_HASH]
= {0, 0}, 1119 [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, 1120 }; 1121 int type = nla_type(a); 1122 1123 if (type > OVS_NAT_ATTR_MAX) { 1124 OVS_NLERR(log, 1125 "Unknown NAT attribute (type=%d, max=%d).\n", 1126 type, OVS_NAT_ATTR_MAX); 1127 return -EINVAL; 1128 } 1129 1130 if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { 1131 OVS_NLERR(log, 1132 "NAT attribute type %d has unexpected length (%d != %d).\n", 1133 type, nla_len(a), 1134 ovs_nat_attr_lens[type][ip_vers]); 1135 return -EINVAL; 1136 } 1137 1138 switch (type) { 1139 case OVS_NAT_ATTR_SRC: 1140 case OVS_NAT_ATTR_DST: 1141 if (info->nat) { 1142 OVS_NLERR(log, 1143 "Only one type of NAT may be specified.\n" 1144 ); 1145 return -ERANGE; 1146 } 1147 info->nat |= OVS_CT_NAT; 1148 info->nat |= ((type == OVS_NAT_ATTR_SRC) 1149 ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); 1150 break; 1151 1152 case OVS_NAT_ATTR_IP_MIN: 1153 nla_memcpy(&info->range.min_addr, a, 1154 sizeof(info->range.min_addr)); 1155 info->range.flags |= NF_NAT_RANGE_MAP_IPS; 1156 break; 1157 1158 case OVS_NAT_ATTR_IP_MAX: 1159 have_ip_max = true; 1160 nla_memcpy(&info->range.max_addr, a, 1161 sizeof(info->range.max_addr)); 1162 info->range.flags |= NF_NAT_RANGE_MAP_IPS; 1163 break; 1164 1165 case OVS_NAT_ATTR_PROTO_MIN: 1166 info->range.min_proto.all = htons(nla_get_u16(a)); 1167 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 1168 break; 1169 1170 case OVS_NAT_ATTR_PROTO_MAX: 1171 have_proto_max = true; 1172 info->range.max_proto.all = htons(nla_get_u16(a)); 1173 info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 1174 break; 1175 1176 case OVS_NAT_ATTR_PERSISTENT: 1177 info->range.flags |= NF_NAT_RANGE_PERSISTENT; 1178 break; 1179 1180 case OVS_NAT_ATTR_PROTO_HASH: 1181 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; 1182 break; 1183 1184 case OVS_NAT_ATTR_PROTO_RANDOM: 1185 info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; 1186 break; 1187 1188 default: 1189 OVS_NLERR(log, "Unknown nat attribute (%d).\n", type); 1190 return -EINVAL; 1191 } 1192 } 
1193 1194 if (rem > 0) { 1195 OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem); 1196 return -EINVAL; 1197 } 1198 if (!info->nat) { 1199 /* Do not allow flags if no type is given. */ 1200 if (info->range.flags) { 1201 OVS_NLERR(log, 1202 "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n" 1203 ); 1204 return -EINVAL; 1205 } 1206 info->nat = OVS_CT_NAT; /* NAT existing connections. */ 1207 } else if (!info->commit) { 1208 OVS_NLERR(log, 1209 "NAT attributes may be specified only when CT COMMIT flag is also specified.\n" 1210 ); 1211 return -EINVAL; 1212 } 1213 /* Allow missing IP_MAX. */ 1214 if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { 1215 memcpy(&info->range.max_addr, &info->range.min_addr, 1216 sizeof(info->range.max_addr)); 1217 } 1218 /* Allow missing PROTO_MAX. */ 1219 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && 1220 !have_proto_max) { 1221 info->range.max_proto.all = info->range.min_proto.all; 1222 } 1223 return 0; 1224 } 1225 #endif 1226 1227 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { 1228 [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1229 [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, 1230 [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), 1231 .maxlen = sizeof(u16) }, 1232 [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), 1233 .maxlen = sizeof(struct md_mark) }, 1234 [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), 1235 .maxlen = sizeof(struct md_labels) }, 1236 [OVS_CT_ATTR_HELPER] = { .minlen = 1, 1237 .maxlen = NF_CT_HELPER_NAME_LEN }, 1238 #ifdef CONFIG_NF_NAT_NEEDED 1239 /* NAT length is checked when parsing the nested attributes. 
*/ 1240 [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, 1241 #endif 1242 }; 1243 1244 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, 1245 const char **helper, bool log) 1246 { 1247 struct nlattr *a; 1248 int rem; 1249 1250 nla_for_each_nested(a, attr, rem) { 1251 int type = nla_type(a); 1252 int maxlen = ovs_ct_attr_lens[type].maxlen; 1253 int minlen = ovs_ct_attr_lens[type].minlen; 1254 1255 if (type > OVS_CT_ATTR_MAX) { 1256 OVS_NLERR(log, 1257 "Unknown conntrack attr (type=%d, max=%d)", 1258 type, OVS_CT_ATTR_MAX); 1259 return -EINVAL; 1260 } 1261 if (nla_len(a) < minlen || nla_len(a) > maxlen) { 1262 OVS_NLERR(log, 1263 "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", 1264 type, nla_len(a), maxlen); 1265 return -EINVAL; 1266 } 1267 1268 switch (type) { 1269 case OVS_CT_ATTR_FORCE_COMMIT: 1270 info->force = true; 1271 /* fall through. */ 1272 case OVS_CT_ATTR_COMMIT: 1273 info->commit = true; 1274 break; 1275 #ifdef CONFIG_NF_CONNTRACK_ZONES 1276 case OVS_CT_ATTR_ZONE: 1277 info->zone.id = nla_get_u16(a); 1278 break; 1279 #endif 1280 #ifdef CONFIG_NF_CONNTRACK_MARK 1281 case OVS_CT_ATTR_MARK: { 1282 struct md_mark *mark = nla_data(a); 1283 1284 if (!mark->mask) { 1285 OVS_NLERR(log, "ct_mark mask cannot be 0"); 1286 return -EINVAL; 1287 } 1288 info->mark = *mark; 1289 break; 1290 } 1291 #endif 1292 #ifdef CONFIG_NF_CONNTRACK_LABELS 1293 case OVS_CT_ATTR_LABELS: { 1294 struct md_labels *labels = nla_data(a); 1295 1296 if (!labels_nonzero(&labels->mask)) { 1297 OVS_NLERR(log, "ct_labels mask cannot be 0"); 1298 return -EINVAL; 1299 } 1300 info->labels = *labels; 1301 break; 1302 } 1303 #endif 1304 case OVS_CT_ATTR_HELPER: 1305 *helper = nla_data(a); 1306 if (!memchr(*helper, '\0', nla_len(a))) { 1307 OVS_NLERR(log, "Invalid conntrack helper"); 1308 return -EINVAL; 1309 } 1310 break; 1311 #ifdef CONFIG_NF_NAT_NEEDED 1312 case OVS_CT_ATTR_NAT: { 1313 int err = parse_nat(a, info, log); 1314 1315 
if (err) 1316 return err; 1317 break; 1318 } 1319 #endif 1320 default: 1321 OVS_NLERR(log, "Unknown conntrack attr (%d)", 1322 type); 1323 return -EINVAL; 1324 } 1325 } 1326 1327 #ifdef CONFIG_NF_CONNTRACK_MARK 1328 if (!info->commit && info->mark.mask) { 1329 OVS_NLERR(log, 1330 "Setting conntrack mark requires 'commit' flag."); 1331 return -EINVAL; 1332 } 1333 #endif 1334 #ifdef CONFIG_NF_CONNTRACK_LABELS 1335 if (!info->commit && labels_nonzero(&info->labels.mask)) { 1336 OVS_NLERR(log, 1337 "Setting conntrack labels requires 'commit' flag."); 1338 return -EINVAL; 1339 } 1340 #endif 1341 if (rem > 0) { 1342 OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); 1343 return -EINVAL; 1344 } 1345 1346 return 0; 1347 } 1348 1349 bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) 1350 { 1351 if (attr == OVS_KEY_ATTR_CT_STATE) 1352 return true; 1353 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1354 attr == OVS_KEY_ATTR_CT_ZONE) 1355 return true; 1356 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && 1357 attr == OVS_KEY_ATTR_CT_MARK) 1358 return true; 1359 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1360 attr == OVS_KEY_ATTR_CT_LABELS) { 1361 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 1362 1363 return ovs_net->xt_label; 1364 } 1365 1366 return false; 1367 } 1368 1369 int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, 1370 const struct sw_flow_key *key, 1371 struct sw_flow_actions **sfa, bool log) 1372 { 1373 struct ovs_conntrack_info ct_info; 1374 const char *helper = NULL; 1375 u16 family; 1376 int err; 1377 1378 family = key_to_nfproto(key); 1379 if (family == NFPROTO_UNSPEC) { 1380 OVS_NLERR(log, "ct family unspecified"); 1381 return -EINVAL; 1382 } 1383 1384 memset(&ct_info, 0, sizeof(ct_info)); 1385 ct_info.family = family; 1386 1387 nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, 1388 NF_CT_DEFAULT_ZONE_DIR, 0); 1389 1390 err = parse_ct(attr, &ct_info, &helper, log); 1391 if (err) 1392 return err; 1393 1394 /* Set up 
template for tracking connections in specific zones. */ 1395 ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); 1396 if (!ct_info.ct) { 1397 OVS_NLERR(log, "Failed to allocate conntrack template"); 1398 return -ENOMEM; 1399 } 1400 1401 __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); 1402 nf_conntrack_get(&ct_info.ct->ct_general); 1403 1404 if (helper) { 1405 err = ovs_ct_add_helper(&ct_info, helper, key, log); 1406 if (err) 1407 goto err_free_ct; 1408 } 1409 1410 err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, 1411 sizeof(ct_info), log); 1412 if (err) 1413 goto err_free_ct; 1414 1415 return 0; 1416 err_free_ct: 1417 __ovs_ct_free_action(&ct_info); 1418 return err; 1419 } 1420 1421 #ifdef CONFIG_NF_NAT_NEEDED 1422 static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, 1423 struct sk_buff *skb) 1424 { 1425 struct nlattr *start; 1426 1427 start = nla_nest_start(skb, OVS_CT_ATTR_NAT); 1428 if (!start) 1429 return false; 1430 1431 if (info->nat & OVS_CT_SRC_NAT) { 1432 if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) 1433 return false; 1434 } else if (info->nat & OVS_CT_DST_NAT) { 1435 if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) 1436 return false; 1437 } else { 1438 goto out; 1439 } 1440 1441 if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { 1442 if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && 1443 info->family == NFPROTO_IPV4) { 1444 if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, 1445 info->range.min_addr.ip) || 1446 (info->range.max_addr.ip 1447 != info->range.min_addr.ip && 1448 (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, 1449 info->range.max_addr.ip)))) 1450 return false; 1451 } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && 1452 info->family == NFPROTO_IPV6) { 1453 if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, 1454 &info->range.min_addr.in6) || 1455 (memcmp(&info->range.max_addr.in6, 1456 &info->range.min_addr.in6, 1457 sizeof(info->range.max_addr.in6)) && 1458 (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, 1459 &info->range.max_addr.in6)))) 1460 return 
false; 1461 } else { 1462 return false; 1463 } 1464 } 1465 if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && 1466 (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, 1467 ntohs(info->range.min_proto.all)) || 1468 (info->range.max_proto.all != info->range.min_proto.all && 1469 nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, 1470 ntohs(info->range.max_proto.all))))) 1471 return false; 1472 1473 if (info->range.flags & NF_NAT_RANGE_PERSISTENT && 1474 nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) 1475 return false; 1476 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && 1477 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) 1478 return false; 1479 if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && 1480 nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) 1481 return false; 1482 out: 1483 nla_nest_end(skb, start); 1484 1485 return true; 1486 } 1487 #endif 1488 1489 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, 1490 struct sk_buff *skb) 1491 { 1492 struct nlattr *start; 1493 1494 start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); 1495 if (!start) 1496 return -EMSGSIZE; 1497 1498 if (ct_info->commit && nla_put_flag(skb, ct_info->force 1499 ? 
OVS_CT_ATTR_FORCE_COMMIT 1500 : OVS_CT_ATTR_COMMIT)) 1501 return -EMSGSIZE; 1502 if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && 1503 nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) 1504 return -EMSGSIZE; 1505 if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && 1506 nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), 1507 &ct_info->mark)) 1508 return -EMSGSIZE; 1509 if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && 1510 labels_nonzero(&ct_info->labels.mask) && 1511 nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), 1512 &ct_info->labels)) 1513 return -EMSGSIZE; 1514 if (ct_info->helper) { 1515 if (nla_put_string(skb, OVS_CT_ATTR_HELPER, 1516 ct_info->helper->name)) 1517 return -EMSGSIZE; 1518 } 1519 #ifdef CONFIG_NF_NAT_NEEDED 1520 if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) 1521 return -EMSGSIZE; 1522 #endif 1523 nla_nest_end(skb, start); 1524 1525 return 0; 1526 } 1527 1528 void ovs_ct_free_action(const struct nlattr *a) 1529 { 1530 struct ovs_conntrack_info *ct_info = nla_data(a); 1531 1532 __ovs_ct_free_action(ct_info); 1533 } 1534 1535 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) 1536 { 1537 if (ct_info->helper) 1538 module_put(ct_info->helper->me); 1539 if (ct_info->ct) 1540 nf_ct_tmpl_free(ct_info->ct); 1541 } 1542 1543 void ovs_ct_init(struct net *net) 1544 { 1545 unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; 1546 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 1547 1548 if (nf_connlabels_get(net, n_bits - 1)) { 1549 ovs_net->xt_label = false; 1550 OVS_NLERR(true, "Failed to set connlabel length"); 1551 } else { 1552 ovs_net->xt_label = true; 1553 } 1554 } 1555 1556 void ovs_ct_exit(struct net *net) 1557 { 1558 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 1559 1560 if (ovs_net->xt_label) 1561 nf_connlabels_put(net); 1562 } 1563