1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * (C) 1999-2001 Paul `Rusty' Russell 4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 5 * (C) 2011 Patrick McHardy <kaber@trash.net> 6 */ 7 8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9 10 #include <linux/module.h> 11 #include <linux/types.h> 12 #include <linux/timer.h> 13 #include <linux/skbuff.h> 14 #include <linux/gfp.h> 15 #include <net/xfrm.h> 16 #include <linux/siphash.h> 17 #include <linux/rtnetlink.h> 18 19 #include <net/netfilter/nf_conntrack.h> 20 #include <net/netfilter/nf_conntrack_core.h> 21 #include <net/netfilter/nf_conntrack_helper.h> 22 #include <net/netfilter/nf_conntrack_seqadj.h> 23 #include <net/netfilter/nf_conntrack_zones.h> 24 #include <net/netfilter/nf_nat.h> 25 #include <net/netfilter/nf_nat_helper.h> 26 #include <uapi/linux/netfilter/nf_nat.h> 27 28 #include "nf_internals.h" 29 30 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; 31 32 static DEFINE_MUTEX(nf_nat_proto_mutex); 33 static unsigned int nat_net_id __read_mostly; 34 35 static struct hlist_head *nf_nat_bysource __read_mostly; 36 static unsigned int nf_nat_htable_size __read_mostly; 37 static siphash_aligned_key_t nf_nat_hash_rnd; 38 39 struct nf_nat_lookup_hook_priv { 40 struct nf_hook_entries __rcu *entries; 41 42 struct rcu_head rcu_head; 43 }; 44 45 struct nf_nat_hooks_net { 46 struct nf_hook_ops *nat_hook_ops; 47 unsigned int users; 48 }; 49 50 struct nat_net { 51 struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; 52 }; 53 54 #ifdef CONFIG_XFRM 55 static void nf_nat_ipv4_decode_session(struct sk_buff *skb, 56 const struct nf_conn *ct, 57 enum ip_conntrack_dir dir, 58 unsigned long statusbit, 59 struct flowi *fl) 60 { 61 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; 62 struct flowi4 *fl4 = &fl->u.ip4; 63 64 if (ct->status & statusbit) { 65 fl4->daddr = t->dst.u3.ip; 66 if (t->dst.protonum == IPPROTO_TCP || 67 t->dst.protonum == IPPROTO_UDP || 68 t->dst.protonum == IPPROTO_UDPLITE || 69 t->dst.protonum == IPPROTO_DCCP || 70 t->dst.protonum == IPPROTO_SCTP) 71 fl4->fl4_dport = t->dst.u.all; 72 } 73 74 statusbit ^= IPS_NAT_MASK; 75 76 if (ct->status & statusbit) { 77 fl4->saddr = t->src.u3.ip; 78 if (t->dst.protonum == IPPROTO_TCP || 79 t->dst.protonum == IPPROTO_UDP || 80 t->dst.protonum == IPPROTO_UDPLITE || 81 t->dst.protonum == IPPROTO_DCCP || 82 t->dst.protonum == IPPROTO_SCTP) 83 fl4->fl4_sport = t->src.u.all; 84 } 85 } 86 87 static void nf_nat_ipv6_decode_session(struct sk_buff *skb, 88 const struct nf_conn *ct, 89 enum ip_conntrack_dir dir, 90 unsigned long statusbit, 91 struct flowi *fl) 92 { 93 #if IS_ENABLED(CONFIG_IPV6) 94 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; 95 struct flowi6 *fl6 = &fl->u.ip6; 96 97 if (ct->status & statusbit) { 98 fl6->daddr = t->dst.u3.in6; 99 if (t->dst.protonum == IPPROTO_TCP || 100 t->dst.protonum == IPPROTO_UDP || 101 t->dst.protonum == IPPROTO_UDPLITE || 102 t->dst.protonum == IPPROTO_DCCP || 103 t->dst.protonum == IPPROTO_SCTP) 104 fl6->fl6_dport = t->dst.u.all; 105 } 106 107 statusbit ^= IPS_NAT_MASK; 108 109 if (ct->status & statusbit) { 110 fl6->saddr = t->src.u3.in6; 111 if (t->dst.protonum == IPPROTO_TCP || 112 t->dst.protonum == IPPROTO_UDP || 113 t->dst.protonum == IPPROTO_UDPLITE || 114 t->dst.protonum == IPPROTO_DCCP || 115 t->dst.protonum == IPPROTO_SCTP) 116 fl6->fl6_sport = t->src.u.all; 117 } 118 #endif 119 } 120 121 static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) 122 { 123 const struct nf_conn *ct; 124 enum ip_conntrack_info ctinfo; 125 enum ip_conntrack_dir dir; 126 unsigned long statusbit; 127 u8 family; 128 129 ct = nf_ct_get(skb, &ctinfo); 130 if (ct == NULL) 131 return; 132 133 family = nf_ct_l3num(ct); 134 dir = CTINFO2DIR(ctinfo); 135 if (dir == IP_CT_DIR_ORIGINAL) 136 statusbit = IPS_DST_NAT; 137 else 138 statusbit = IPS_SRC_NAT; 139 140 switch (family) { 141 case NFPROTO_IPV4: 142 nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); 143 return; 144 case NFPROTO_IPV6: 145 nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); 146 return; 147 } 148 } 149 #endif /* CONFIG_XFRM */ 150 151 /* We keep an extra hash for each conntrack, for fast searching. */ 152 static unsigned int 153 hash_by_src(const struct net *net, 154 const struct nf_conntrack_zone *zone, 155 const struct nf_conntrack_tuple *tuple) 156 { 157 unsigned int hash; 158 struct { 159 struct nf_conntrack_man src; 160 u32 net_mix; 161 u32 protonum; 162 u32 zone; 163 } __aligned(SIPHASH_ALIGNMENT) combined; 164 165 get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); 166 167 memset(&combined, 0, sizeof(combined)); 168 169 /* Original src, to ensure we map it consistently if poss. */ 170 combined.src = tuple->src; 171 combined.net_mix = net_hash_mix(net); 172 combined.protonum = tuple->dst.protonum; 173 174 /* Zone ID can be used provided its valid for both directions */ 175 if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) 176 combined.zone = zone->id; 177 178 hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); 179 180 return reciprocal_scale(hash, nf_nat_htable_size); 181 } 182 183 /* Is this tuple already taken? (not by us) */ 184 static int 185 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, 186 const struct nf_conn *ignored_conntrack) 187 { 188 /* Conntrack tracking doesn't keep track of outgoing tuples; only 189 * incoming ones. NAT means they don't have a fixed mapping, 190 * so we invert the tuple and look for the incoming reply. 191 * 192 * We could keep a separate hash if this proves too slow. 193 */ 194 struct nf_conntrack_tuple reply; 195 196 nf_ct_invert_tuple(&reply, tuple); 197 return nf_conntrack_tuple_taken(&reply, ignored_conntrack); 198 } 199 200 static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, 201 const struct nf_nat_range2 *range) 202 { 203 if (t->src.l3num == NFPROTO_IPV4) 204 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && 205 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); 206 207 return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && 208 ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; 209 } 210 211 /* Is the manipable part of the tuple between min and max incl? */ 212 static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, 213 enum nf_nat_manip_type maniptype, 214 const union nf_conntrack_man_proto *min, 215 const union nf_conntrack_man_proto *max) 216 { 217 __be16 port; 218 219 switch (tuple->dst.protonum) { 220 case IPPROTO_ICMP: 221 case IPPROTO_ICMPV6: 222 return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && 223 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); 224 case IPPROTO_GRE: /* all fall though */ 225 case IPPROTO_TCP: 226 case IPPROTO_UDP: 227 case IPPROTO_UDPLITE: 228 case IPPROTO_DCCP: 229 case IPPROTO_SCTP: 230 if (maniptype == NF_NAT_MANIP_SRC) 231 port = tuple->src.u.all; 232 else 233 port = tuple->dst.u.all; 234 235 return ntohs(port) >= ntohs(min->all) && 236 ntohs(port) <= ntohs(max->all); 237 default: 238 return true; 239 } 240 } 241 242 /* If we source map this tuple so reply looks like reply_tuple, will 243 * that meet the constraints of range. 244 */ 245 static int in_range(const struct nf_conntrack_tuple *tuple, 246 const struct nf_nat_range2 *range) 247 { 248 /* If we are supposed to map IPs, then we must be in the 249 * range specified, otherwise let this drag us onto a new src IP. 250 */ 251 if (range->flags & NF_NAT_RANGE_MAP_IPS && 252 !nf_nat_inet_in_range(tuple, range)) 253 return 0; 254 255 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 256 return 1; 257 258 return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, 259 &range->min_proto, &range->max_proto); 260 } 261 262 static inline int 263 same_src(const struct nf_conn *ct, 264 const struct nf_conntrack_tuple *tuple) 265 { 266 const struct nf_conntrack_tuple *t; 267 268 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 269 return (t->dst.protonum == tuple->dst.protonum && 270 nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && 271 t->src.u.all == tuple->src.u.all); 272 } 273 274 /* Only called for SRC manip */ 275 static int 276 find_appropriate_src(struct net *net, 277 const struct nf_conntrack_zone *zone, 278 const struct nf_conntrack_tuple *tuple, 279 struct nf_conntrack_tuple *result, 280 const struct nf_nat_range2 *range) 281 { 282 unsigned int h = hash_by_src(net, zone, tuple); 283 const struct nf_conn *ct; 284 285 hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { 286 if (same_src(ct, tuple) && 287 net_eq(net, nf_ct_net(ct)) && 288 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { 289 /* Copy source part from reply tuple. */ 290 nf_ct_invert_tuple(result, 291 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 292 result->dst = tuple->dst; 293 294 if (in_range(result, range)) 295 return 1; 296 } 297 } 298 return 0; 299 } 300 301 /* For [FUTURE] fragmentation handling, we want the least-used 302 * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus 303 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports 304 * 1-65535, we don't do pro-rata allocation based on ports; we choose 305 * the ip with the lowest src-ip/dst-ip/proto usage. 306 */ 307 static void 308 find_best_ips_proto(const struct nf_conntrack_zone *zone, 309 struct nf_conntrack_tuple *tuple, 310 const struct nf_nat_range2 *range, 311 const struct nf_conn *ct, 312 enum nf_nat_manip_type maniptype) 313 { 314 union nf_inet_addr *var_ipp; 315 unsigned int i, max; 316 /* Host order */ 317 u32 minip, maxip, j, dist; 318 bool full_range; 319 320 /* No IP mapping? Do nothing. */ 321 if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) 322 return; 323 324 if (maniptype == NF_NAT_MANIP_SRC) 325 var_ipp = &tuple->src.u3; 326 else 327 var_ipp = &tuple->dst.u3; 328 329 /* Fast path: only one choice. */ 330 if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { 331 *var_ipp = range->min_addr; 332 return; 333 } 334 335 if (nf_ct_l3num(ct) == NFPROTO_IPV4) 336 max = sizeof(var_ipp->ip) / sizeof(u32) - 1; 337 else 338 max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; 339 340 /* Hashing source and destination IPs gives a fairly even 341 * spread in practice (if there are a small number of IPs 342 * involved, there usually aren't that many connections 343 * anyway). The consistency means that servers see the same 344 * client coming from the same IP (some Internet Banking sites 345 * like this), even across reboots. 346 */ 347 j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), 348 range->flags & NF_NAT_RANGE_PERSISTENT ? 349 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); 350 351 full_range = false; 352 for (i = 0; i <= max; i++) { 353 /* If first bytes of the address are at the maximum, use the 354 * distance. Otherwise use the full range. 355 */ 356 if (!full_range) { 357 minip = ntohl((__force __be32)range->min_addr.all[i]); 358 maxip = ntohl((__force __be32)range->max_addr.all[i]); 359 dist = maxip - minip + 1; 360 } else { 361 minip = 0; 362 dist = ~0; 363 } 364 365 var_ipp->all[i] = (__force __u32) 366 htonl(minip + reciprocal_scale(j, dist)); 367 if (var_ipp->all[i] != range->max_addr.all[i]) 368 full_range = true; 369 370 if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) 371 j ^= (__force u32)tuple->dst.u3.all[i]; 372 } 373 } 374 375 /* Alter the per-proto part of the tuple (depending on maniptype), to 376 * give a unique tuple in the given range if possible. 377 * 378 * Per-protocol part of tuple is initialized to the incoming packet. 379 */ 380 static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, 381 const struct nf_nat_range2 *range, 382 enum nf_nat_manip_type maniptype, 383 const struct nf_conn *ct) 384 { 385 unsigned int range_size, min, max, i, attempts; 386 __be16 *keyptr; 387 u16 off; 388 static const unsigned int max_attempts = 128; 389 390 switch (tuple->dst.protonum) { 391 case IPPROTO_ICMP: 392 case IPPROTO_ICMPV6: 393 /* id is same for either direction... */ 394 keyptr = &tuple->src.u.icmp.id; 395 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 396 min = 0; 397 range_size = 65536; 398 } else { 399 min = ntohs(range->min_proto.icmp.id); 400 range_size = ntohs(range->max_proto.icmp.id) - 401 ntohs(range->min_proto.icmp.id) + 1; 402 } 403 goto find_free_id; 404 #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) 405 case IPPROTO_GRE: 406 /* If there is no master conntrack we are not PPTP, 407 do not change tuples */ 408 if (!ct->master) 409 return; 410 411 if (maniptype == NF_NAT_MANIP_SRC) 412 keyptr = &tuple->src.u.gre.key; 413 else 414 keyptr = &tuple->dst.u.gre.key; 415 416 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 417 min = 1; 418 range_size = 65535; 419 } else { 420 min = ntohs(range->min_proto.gre.key); 421 range_size = ntohs(range->max_proto.gre.key) - min + 1; 422 } 423 goto find_free_id; 424 #endif 425 case IPPROTO_UDP: 426 case IPPROTO_UDPLITE: 427 case IPPROTO_TCP: 428 case IPPROTO_SCTP: 429 case IPPROTO_DCCP: 430 if (maniptype == NF_NAT_MANIP_SRC) 431 keyptr = &tuple->src.u.all; 432 else 433 keyptr = &tuple->dst.u.all; 434 435 break; 436 default: 437 return; 438 } 439 440 /* If no range specified... */ 441 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 442 /* If it's dst rewrite, can't change port */ 443 if (maniptype == NF_NAT_MANIP_DST) 444 return; 445 446 if (ntohs(*keyptr) < 1024) { 447 /* Loose convention: >> 512 is credential passing */ 448 if (ntohs(*keyptr) < 512) { 449 min = 1; 450 range_size = 511 - min + 1; 451 } else { 452 min = 600; 453 range_size = 1023 - min + 1; 454 } 455 } else { 456 min = 1024; 457 range_size = 65535 - 1024 + 1; 458 } 459 } else { 460 min = ntohs(range->min_proto.all); 461 max = ntohs(range->max_proto.all); 462 if (unlikely(max < min)) 463 swap(max, min); 464 range_size = max - min + 1; 465 } 466 467 find_free_id: 468 if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) 469 off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); 470 else 471 off = prandom_u32(); 472 473 attempts = range_size; 474 if (attempts > max_attempts) 475 attempts = max_attempts; 476 477 /* We are in softirq; doing a search of the entire range risks 478 * soft lockup when all tuples are already used. 479 * 480 * If we can't find any free port from first offset, pick a new 481 * one and try again, with ever smaller search window. 482 */ 483 another_round: 484 for (i = 0; i < attempts; i++, off++) { 485 *keyptr = htons(min + off % range_size); 486 if (!nf_nat_used_tuple(tuple, ct)) 487 return; 488 } 489 490 if (attempts >= range_size || attempts < 16) 491 return; 492 attempts /= 2; 493 off = prandom_u32(); 494 goto another_round; 495 } 496 497 /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, 498 * we change the source to map into the range. For NF_INET_PRE_ROUTING 499 * and NF_INET_LOCAL_OUT, we change the destination to map into the 500 * range. It might not be possible to get a unique tuple, but we try. 501 * At worst (or if we race), we will end up with a final duplicate in 502 * __nf_conntrack_confirm and drop the packet. */ 503 static void 504 get_unique_tuple(struct nf_conntrack_tuple *tuple, 505 const struct nf_conntrack_tuple *orig_tuple, 506 const struct nf_nat_range2 *range, 507 struct nf_conn *ct, 508 enum nf_nat_manip_type maniptype) 509 { 510 const struct nf_conntrack_zone *zone; 511 struct net *net = nf_ct_net(ct); 512 513 zone = nf_ct_zone(ct); 514 515 /* 1) If this srcip/proto/src-proto-part is currently mapped, 516 * and that same mapping gives a unique tuple within the given 517 * range, use that. 518 * 519 * This is only required for source (ie. NAT/masq) mappings. 520 * So far, we don't do local source mappings, so multiple 521 * manips not an issue. 522 */ 523 if (maniptype == NF_NAT_MANIP_SRC && 524 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 525 /* try the original tuple first */ 526 if (in_range(orig_tuple, range)) { 527 if (!nf_nat_used_tuple(orig_tuple, ct)) { 528 *tuple = *orig_tuple; 529 return; 530 } 531 } else if (find_appropriate_src(net, zone, 532 orig_tuple, tuple, range)) { 533 pr_debug("get_unique_tuple: Found current src map\n"); 534 if (!nf_nat_used_tuple(tuple, ct)) 535 return; 536 } 537 } 538 539 /* 2) Select the least-used IP/proto combination in the given range */ 540 *tuple = *orig_tuple; 541 find_best_ips_proto(zone, tuple, range, ct, maniptype); 542 543 /* 3) The per-protocol part of the manip is made to map into 544 * the range to make a unique tuple. 545 */ 546 547 /* Only bother mapping if it's not already in range and unique */ 548 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 549 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 550 if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && 551 l4proto_in_range(tuple, maniptype, 552 &range->min_proto, 553 &range->max_proto) && 554 (range->min_proto.all == range->max_proto.all || 555 !nf_nat_used_tuple(tuple, ct))) 556 return; 557 } else if (!nf_nat_used_tuple(tuple, ct)) { 558 return; 559 } 560 } 561 562 /* Last chance: get protocol to try to obtain unique tuple. */ 563 nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); 564 } 565 566 struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) 567 { 568 struct nf_conn_nat *nat = nfct_nat(ct); 569 if (nat) 570 return nat; 571 572 if (!nf_ct_is_confirmed(ct)) 573 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); 574 575 return nat; 576 } 577 EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); 578 579 unsigned int 580 nf_nat_setup_info(struct nf_conn *ct, 581 const struct nf_nat_range2 *range, 582 enum nf_nat_manip_type maniptype) 583 { 584 struct net *net = nf_ct_net(ct); 585 struct nf_conntrack_tuple curr_tuple, new_tuple; 586 587 /* Can't setup nat info for confirmed ct. */ 588 if (nf_ct_is_confirmed(ct)) 589 return NF_ACCEPT; 590 591 WARN_ON(maniptype != NF_NAT_MANIP_SRC && 592 maniptype != NF_NAT_MANIP_DST); 593 594 if (WARN_ON(nf_nat_initialized(ct, maniptype))) 595 return NF_DROP; 596 597 /* What we've got will look like inverse of reply. Normally 598 * this is what is in the conntrack, except for prior 599 * manipulations (future optimization: if num_manips == 0, 600 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 601 */ 602 nf_ct_invert_tuple(&curr_tuple, 603 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 604 605 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); 606 607 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { 608 struct nf_conntrack_tuple reply; 609 610 /* Alter conntrack table so will recognize replies. */ 611 nf_ct_invert_tuple(&reply, &new_tuple); 612 nf_conntrack_alter_reply(ct, &reply); 613 614 /* Non-atomic: we own this at the moment. */ 615 if (maniptype == NF_NAT_MANIP_SRC) 616 ct->status |= IPS_SRC_NAT; 617 else 618 ct->status |= IPS_DST_NAT; 619 620 if (nfct_help(ct) && !nfct_seqadj(ct)) 621 if (!nfct_seqadj_ext_add(ct)) 622 return NF_DROP; 623 } 624 625 if (maniptype == NF_NAT_MANIP_SRC) { 626 unsigned int srchash; 627 spinlock_t *lock; 628 629 srchash = hash_by_src(net, nf_ct_zone(ct), 630 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 631 lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; 632 spin_lock_bh(lock); 633 hlist_add_head_rcu(&ct->nat_bysource, 634 &nf_nat_bysource[srchash]); 635 spin_unlock_bh(lock); 636 } 637 638 /* It's done. */ 639 if (maniptype == NF_NAT_MANIP_DST) 640 ct->status |= IPS_DST_NAT_DONE; 641 else 642 ct->status |= IPS_SRC_NAT_DONE; 643 644 return NF_ACCEPT; 645 } 646 EXPORT_SYMBOL(nf_nat_setup_info); 647 648 static unsigned int 649 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) 650 { 651 /* Force range to this IP; let proto decide mapping for 652 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 653 * Use reply in case it's already been mangled (eg local packet). 654 */ 655 union nf_inet_addr ip = 656 (manip == NF_NAT_MANIP_SRC ? 657 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : 658 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); 659 struct nf_nat_range2 range = { 660 .flags = NF_NAT_RANGE_MAP_IPS, 661 .min_addr = ip, 662 .max_addr = ip, 663 }; 664 return nf_nat_setup_info(ct, &range, manip); 665 } 666 667 unsigned int 668 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 669 { 670 return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); 671 } 672 EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); 673 674 /* Do packet manipulations according to nf_nat_setup_info. */ 675 unsigned int nf_nat_packet(struct nf_conn *ct, 676 enum ip_conntrack_info ctinfo, 677 unsigned int hooknum, 678 struct sk_buff *skb) 679 { 680 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); 681 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 682 unsigned int verdict = NF_ACCEPT; 683 unsigned long statusbit; 684 685 if (mtype == NF_NAT_MANIP_SRC) 686 statusbit = IPS_SRC_NAT; 687 else 688 statusbit = IPS_DST_NAT; 689 690 /* Invert if this is reply dir. */ 691 if (dir == IP_CT_DIR_REPLY) 692 statusbit ^= IPS_NAT_MASK; 693 694 /* Non-atomic: these bits don't change. */ 695 if (ct->status & statusbit) 696 verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); 697 698 return verdict; 699 } 700 EXPORT_SYMBOL_GPL(nf_nat_packet); 701 702 static bool in_vrf_postrouting(const struct nf_hook_state *state) 703 { 704 #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) 705 if (state->hook == NF_INET_POST_ROUTING && 706 netif_is_l3_master(state->out)) 707 return true; 708 #endif 709 return false; 710 } 711 712 unsigned int 713 nf_nat_inet_fn(void *priv, struct sk_buff *skb, 714 const struct nf_hook_state *state) 715 { 716 struct nf_conn *ct; 717 enum ip_conntrack_info ctinfo; 718 struct nf_conn_nat *nat; 719 /* maniptype == SRC for postrouting. */ 720 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 721 722 ct = nf_ct_get(skb, &ctinfo); 723 /* Can't track? It's not due to stress, or conntrack would 724 * have dropped it. Hence it's the user's responsibilty to 725 * packet filter it out, or implement conntrack/NAT for that 726 * protocol. 8) --RR 727 */ 728 if (!ct || in_vrf_postrouting(state)) 729 return NF_ACCEPT; 730 731 nat = nfct_nat(ct); 732 733 switch (ctinfo) { 734 case IP_CT_RELATED: 735 case IP_CT_RELATED_REPLY: 736 /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ 737 case IP_CT_NEW: 738 /* Seen it before? This can happen for loopback, retrans, 739 * or local packets. 740 */ 741 if (!nf_nat_initialized(ct, maniptype)) { 742 struct nf_nat_lookup_hook_priv *lpriv = priv; 743 struct nf_hook_entries *e = rcu_dereference(lpriv->entries); 744 unsigned int ret; 745 int i; 746 747 if (!e) 748 goto null_bind; 749 750 for (i = 0; i < e->num_hook_entries; i++) { 751 ret = e->hooks[i].hook(e->hooks[i].priv, skb, 752 state); 753 if (ret != NF_ACCEPT) 754 return ret; 755 if (nf_nat_initialized(ct, maniptype)) 756 goto do_nat; 757 } 758 null_bind: 759 ret = nf_nat_alloc_null_binding(ct, state->hook); 760 if (ret != NF_ACCEPT) 761 return ret; 762 } else { 763 pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", 764 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", 765 ct, ct->status); 766 if (nf_nat_oif_changed(state->hook, ctinfo, nat, 767 state->out)) 768 goto oif_changed; 769 } 770 break; 771 default: 772 /* ESTABLISHED */ 773 WARN_ON(ctinfo != IP_CT_ESTABLISHED && 774 ctinfo != IP_CT_ESTABLISHED_REPLY); 775 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) 776 goto oif_changed; 777 } 778 do_nat: 779 return nf_nat_packet(ct, ctinfo, state->hook, skb); 780 781 oif_changed: 782 nf_ct_kill_acct(ct, ctinfo, skb); 783 return NF_DROP; 784 } 785 EXPORT_SYMBOL_GPL(nf_nat_inet_fn); 786 787 struct nf_nat_proto_clean { 788 u8 l3proto; 789 u8 l4proto; 790 }; 791 792 /* kill conntracks with affected NAT section */ 793 static int nf_nat_proto_remove(struct nf_conn *i, void *data) 794 { 795 const struct nf_nat_proto_clean *clean = data; 796 797 if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || 798 (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) 799 return 0; 800 801 return i->status & IPS_NAT_MASK ? 1 : 0; 802 } 803 804 static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 805 { 806 unsigned int h; 807 808 h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 809 spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); 810 hlist_del_rcu(&ct->nat_bysource); 811 spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); 812 } 813 814 static int nf_nat_proto_clean(struct nf_conn *ct, void *data) 815 { 816 if (nf_nat_proto_remove(ct, data)) 817 return 1; 818 819 /* This module is being removed and conntrack has nat null binding. 820 * Remove it from bysource hash, as the table will be freed soon. 821 * 822 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() 823 * will delete entry from already-freed table. 824 */ 825 if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) 826 nf_nat_cleanup_conntrack(ct); 827 828 /* don't delete conntrack. Although that would make things a lot 829 * simpler, we'd end up flushing all conntracks on nat rmmod. 830 */ 831 return 0; 832 } 833 834 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 835 836 #include <linux/netfilter/nfnetlink.h> 837 #include <linux/netfilter/nfnetlink_conntrack.h> 838 839 static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 840 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 841 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 842 }; 843 844 static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], 845 struct nf_nat_range2 *range) 846 { 847 if (tb[CTA_PROTONAT_PORT_MIN]) { 848 range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); 849 range->max_proto.all = range->min_proto.all; 850 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 851 } 852 if (tb[CTA_PROTONAT_PORT_MAX]) { 853 range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); 854 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 855 } 856 return 0; 857 } 858 859 static int nfnetlink_parse_nat_proto(struct nlattr *attr, 860 const struct nf_conn *ct, 861 struct nf_nat_range2 *range) 862 { 863 struct nlattr *tb[CTA_PROTONAT_MAX+1]; 864 int err; 865 866 err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, 867 protonat_nla_policy, NULL); 868 if (err < 0) 869 return err; 870 871 return nf_nat_l4proto_nlattr_to_range(tb, range); 872 } 873 874 static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { 875 [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, 876 [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, 877 [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, 878 [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, 879 [CTA_NAT_PROTO] = { .type = NLA_NESTED }, 880 }; 881 882 static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], 883 struct nf_nat_range2 *range) 884 { 885 if (tb[CTA_NAT_V4_MINIP]) { 886 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); 887 range->flags |= NF_NAT_RANGE_MAP_IPS; 888 } 889 890 if (tb[CTA_NAT_V4_MAXIP]) 891 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); 892 else 893 range->max_addr.ip = range->min_addr.ip; 894 895 return 0; 896 } 897 898 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], 899 struct nf_nat_range2 *range) 900 { 901 if (tb[CTA_NAT_V6_MINIP]) { 902 nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], 903 sizeof(struct in6_addr)); 904 range->flags |= NF_NAT_RANGE_MAP_IPS; 905 } 906 907 if (tb[CTA_NAT_V6_MAXIP]) 908 nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], 909 sizeof(struct in6_addr)); 910 else 911 range->max_addr = range->min_addr; 912 913 return 0; 914 } 915 916 static int 917 nfnetlink_parse_nat(const struct nlattr *nat, 918 const struct nf_conn *ct, struct nf_nat_range2 *range) 919 { 920 struct nlattr *tb[CTA_NAT_MAX+1]; 921 int err; 922 923 memset(range, 0, sizeof(*range)); 924 925 err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, 926 nat_nla_policy, NULL); 927 if (err < 0) 928 return err; 929 930 switch (nf_ct_l3num(ct)) { 931 case NFPROTO_IPV4: 932 err = nf_nat_ipv4_nlattr_to_range(tb, range); 933 break; 934 case NFPROTO_IPV6: 935 err = nf_nat_ipv6_nlattr_to_range(tb, range); 936 break; 937 default: 938 err = -EPROTONOSUPPORT; 939 break; 940 } 941 942 if (err) 943 return err; 944 945 if (!tb[CTA_NAT_PROTO]) 946 return 0; 947 948 return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); 949 } 950 951 /* This function is called under rcu_read_lock() */ 952 static int 953 nfnetlink_parse_nat_setup(struct nf_conn *ct, 954 enum nf_nat_manip_type manip, 955 const struct nlattr *attr) 956 { 957 struct nf_nat_range2 range; 958 int err; 959 960 /* Should not happen, restricted to creating new conntracks 961 * via ctnetlink. 962 */ 963 if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) 964 return -EEXIST; 965 966 /* No NAT information has been passed, allocate the null-binding */ 967 if (attr == NULL) 968 return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; 969 970 err = nfnetlink_parse_nat(attr, ct, &range); 971 if (err < 0) 972 return err; 973 974 return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; 975 } 976 #else 977 static int 978 nfnetlink_parse_nat_setup(struct nf_conn *ct, 979 enum nf_nat_manip_type manip, 980 const struct nlattr *attr) 981 { 982 return -EOPNOTSUPP; 983 } 984 #endif 985 986 static struct nf_ct_helper_expectfn follow_master_nat = { 987 .name = "nat-follow-master", 988 .expectfn = nf_nat_follow_master, 989 }; 990 991 int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 992 const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) 993 { 994 struct nat_net *nat_net = net_generic(net, nat_net_id); 995 struct nf_nat_hooks_net *nat_proto_net; 996 struct nf_nat_lookup_hook_priv *priv; 997 unsigned int hooknum = ops->hooknum; 998 struct nf_hook_ops *nat_ops; 999 int i, ret; 1000 1001 if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) 1002 return -EINVAL; 1003 1004 nat_proto_net = &nat_net->nat_proto_net[pf]; 1005 1006 for (i = 0; i < ops_count; i++) { 1007 if (orig_nat_ops[i].hooknum == hooknum) { 1008 hooknum = i; 1009 break; 1010 } 1011 } 1012 1013 if (WARN_ON_ONCE(i == ops_count)) 1014 return -EINVAL; 1015 1016 mutex_lock(&nf_nat_proto_mutex); 1017 if (!nat_proto_net->nat_hook_ops) { 1018 WARN_ON(nat_proto_net->users != 0); 1019 1020 nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); 1021 if (!nat_ops) { 1022 mutex_unlock(&nf_nat_proto_mutex); 1023 return -ENOMEM; 1024 } 1025 1026 for (i = 0; i < ops_count; i++) { 1027 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1028 if (priv) { 1029 nat_ops[i].priv = priv; 1030 continue; 1031 } 1032 mutex_unlock(&nf_nat_proto_mutex); 1033 while (i) 1034 kfree(nat_ops[--i].priv); 1035 kfree(nat_ops); 1036 return -ENOMEM; 1037 } 1038 1039 ret = nf_register_net_hooks(net, nat_ops, ops_count); 1040 if (ret < 0) { 1041 mutex_unlock(&nf_nat_proto_mutex); 1042 for (i = 0; i < ops_count; i++) 1043 kfree(nat_ops[i].priv); 1044 kfree(nat_ops); 1045 return ret; 1046 } 1047 1048 nat_proto_net->nat_hook_ops = nat_ops; 1049 } 1050 1051 nat_ops = nat_proto_net->nat_hook_ops; 1052 priv = nat_ops[hooknum].priv; 1053 if (WARN_ON_ONCE(!priv)) { 1054 mutex_unlock(&nf_nat_proto_mutex); 1055 return -EOPNOTSUPP; 1056 } 1057 1058 ret = nf_hook_entries_insert_raw(&priv->entries, ops); 1059 if (ret == 0) 1060 nat_proto_net->users++; 1061 1062 mutex_unlock(&nf_nat_proto_mutex); 1063 return ret; 1064 } 1065 1066 void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 1067 unsigned int ops_count) 1068 { 1069 struct nat_net *nat_net = net_generic(net, nat_net_id); 1070 struct nf_nat_hooks_net *nat_proto_net; 1071 struct nf_nat_lookup_hook_priv *priv; 1072 struct nf_hook_ops *nat_ops; 1073 int hooknum = ops->hooknum; 1074 int i; 1075 1076 if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) 1077 return; 1078 1079 nat_proto_net = &nat_net->nat_proto_net[pf]; 1080 1081 mutex_lock(&nf_nat_proto_mutex); 1082 if (WARN_ON(nat_proto_net->users == 0)) 1083 goto unlock; 1084 1085 nat_proto_net->users--; 1086 1087 nat_ops = nat_proto_net->nat_hook_ops; 1088 for (i = 0; i < ops_count; i++) { 1089 if (nat_ops[i].hooknum == hooknum) { 1090 hooknum = i; 1091 break; 1092 } 1093 } 1094 if (WARN_ON_ONCE(i == ops_count)) 1095 goto unlock; 1096 priv = nat_ops[hooknum].priv; 1097 nf_hook_entries_delete_raw(&priv->entries, ops); 1098 1099 if (nat_proto_net->users == 0) { 1100 nf_unregister_net_hooks(net, nat_ops, ops_count); 1101 1102 for (i = 0; i < ops_count; i++) { 1103 priv = nat_ops[i].priv; 1104 kfree_rcu(priv, rcu_head); 1105 } 1106 1107 nat_proto_net->nat_hook_ops = NULL; 1108 kfree(nat_ops); 1109 } 1110 unlock: 1111 mutex_unlock(&nf_nat_proto_mutex); 1112 } 1113 1114 static struct pernet_operations nat_net_ops = { 1115 .id = &nat_net_id, 1116 .size = sizeof(struct nat_net), 1117 }; 1118 1119 static const struct nf_nat_hook nat_hook = { 1120 .parse_nat_setup = nfnetlink_parse_nat_setup, 1121 #ifdef CONFIG_XFRM 1122 .decode_session = __nf_nat_decode_session, 1123 #endif 1124 .manip_pkt = nf_nat_manip_pkt, 1125 .remove_nat_bysrc = nf_nat_cleanup_conntrack, 1126 }; 1127 1128 static int __init nf_nat_init(void) 1129 { 1130 int ret, i; 1131 1132 /* Leave them the same for the moment. */ 1133 nf_nat_htable_size = nf_conntrack_htable_size; 1134 if (nf_nat_htable_size < CONNTRACK_LOCKS) 1135 nf_nat_htable_size = CONNTRACK_LOCKS; 1136 1137 nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); 1138 if (!nf_nat_bysource) 1139 return -ENOMEM; 1140 1141 for (i = 0; i < CONNTRACK_LOCKS; i++) 1142 spin_lock_init(&nf_nat_locks[i]); 1143 1144 ret = register_pernet_subsys(&nat_net_ops); 1145 if (ret < 0) { 1146 kvfree(nf_nat_bysource); 1147 return ret; 1148 } 1149 1150 nf_ct_helper_expectfn_register(&follow_master_nat); 1151 1152 WARN_ON(nf_nat_hook != NULL); 1153 RCU_INIT_POINTER(nf_nat_hook, &nat_hook); 1154 1155 return 0; 1156 } 1157 1158 static void __exit nf_nat_cleanup(void) 1159 { 1160 struct nf_nat_proto_clean clean = {}; 1161 1162 nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); 1163 1164 nf_ct_helper_expectfn_unregister(&follow_master_nat); 1165 RCU_INIT_POINTER(nf_nat_hook, NULL); 1166 1167 synchronize_net(); 1168 kvfree(nf_nat_bysource); 1169 unregister_pernet_subsys(&nat_net_ops); 1170 } 1171 1172 MODULE_LICENSE("GPL"); 1173 1174 module_init(nf_nat_init); 1175 module_exit(nf_nat_cleanup); 1176