1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * (C) 1999-2001 Paul `Rusty' Russell 4 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 5 * (C) 2011 Patrick McHardy <kaber@trash.net> 6 */ 7 8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 9 10 #include <linux/module.h> 11 #include <linux/types.h> 12 #include <linux/timer.h> 13 #include <linux/skbuff.h> 14 #include <linux/gfp.h> 15 #include <net/xfrm.h> 16 #include <linux/jhash.h> 17 #include <linux/rtnetlink.h> 18 19 #include <net/netfilter/nf_conntrack.h> 20 #include <net/netfilter/nf_conntrack_core.h> 21 #include <net/netfilter/nf_conntrack_helper.h> 22 #include <net/netfilter/nf_conntrack_seqadj.h> 23 #include <net/netfilter/nf_conntrack_zones.h> 24 #include <net/netfilter/nf_nat.h> 25 #include <net/netfilter/nf_nat_helper.h> 26 #include <uapi/linux/netfilter/nf_nat.h> 27 28 #include "nf_internals.h" 29 30 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; 31 32 static DEFINE_MUTEX(nf_nat_proto_mutex); 33 static unsigned int nat_net_id __read_mostly; 34 35 static struct hlist_head *nf_nat_bysource __read_mostly; 36 static unsigned int nf_nat_htable_size __read_mostly; 37 static unsigned int nf_nat_hash_rnd __read_mostly; 38 39 struct nf_nat_lookup_hook_priv { 40 struct nf_hook_entries __rcu *entries; 41 42 struct rcu_head rcu_head; 43 }; 44 45 struct nf_nat_hooks_net { 46 struct nf_hook_ops *nat_hook_ops; 47 unsigned int users; 48 }; 49 50 struct nat_net { 51 struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; 52 }; 53 54 #ifdef CONFIG_XFRM 55 static void nf_nat_ipv4_decode_session(struct sk_buff *skb, 56 const struct nf_conn *ct, 57 enum ip_conntrack_dir dir, 58 unsigned long statusbit, 59 struct flowi *fl) 60 { 61 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; 62 struct flowi4 *fl4 = &fl->u.ip4; 63 64 if (ct->status & statusbit) { 65 fl4->daddr = t->dst.u3.ip; 66 if (t->dst.protonum == IPPROTO_TCP || 67 t->dst.protonum == IPPROTO_UDP || 68 t->dst.protonum == IPPROTO_UDPLITE || 69 t->dst.protonum == IPPROTO_DCCP || 70 t->dst.protonum == IPPROTO_SCTP) 71 fl4->fl4_dport = t->dst.u.all; 72 } 73 74 statusbit ^= IPS_NAT_MASK; 75 76 if (ct->status & statusbit) { 77 fl4->saddr = t->src.u3.ip; 78 if (t->dst.protonum == IPPROTO_TCP || 79 t->dst.protonum == IPPROTO_UDP || 80 t->dst.protonum == IPPROTO_UDPLITE || 81 t->dst.protonum == IPPROTO_DCCP || 82 t->dst.protonum == IPPROTO_SCTP) 83 fl4->fl4_sport = t->src.u.all; 84 } 85 } 86 87 static void nf_nat_ipv6_decode_session(struct sk_buff *skb, 88 const struct nf_conn *ct, 89 enum ip_conntrack_dir dir, 90 unsigned long statusbit, 91 struct flowi *fl) 92 { 93 #if IS_ENABLED(CONFIG_IPV6) 94 const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; 95 struct flowi6 *fl6 = &fl->u.ip6; 96 97 if (ct->status & statusbit) { 98 fl6->daddr = t->dst.u3.in6; 99 if (t->dst.protonum == IPPROTO_TCP || 100 t->dst.protonum == IPPROTO_UDP || 101 t->dst.protonum == IPPROTO_UDPLITE || 102 t->dst.protonum == IPPROTO_DCCP || 103 t->dst.protonum == IPPROTO_SCTP) 104 fl6->fl6_dport = t->dst.u.all; 105 } 106 107 statusbit ^= IPS_NAT_MASK; 108 109 if (ct->status & statusbit) { 110 fl6->saddr = t->src.u3.in6; 111 if (t->dst.protonum == IPPROTO_TCP || 112 t->dst.protonum == IPPROTO_UDP || 113 t->dst.protonum == IPPROTO_UDPLITE || 114 t->dst.protonum == IPPROTO_DCCP || 115 t->dst.protonum == IPPROTO_SCTP) 116 fl6->fl6_sport = t->src.u.all; 117 } 118 #endif 119 } 120 121 static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) 122 { 123 const struct nf_conn *ct; 124 enum ip_conntrack_info ctinfo; 125 enum ip_conntrack_dir dir; 126 unsigned long statusbit; 127 u8 family; 128 129 ct = nf_ct_get(skb, &ctinfo); 130 if (ct == NULL) 131 return; 132 133 family = nf_ct_l3num(ct); 134 dir = CTINFO2DIR(ctinfo); 135 if (dir == IP_CT_DIR_ORIGINAL) 136 statusbit = IPS_DST_NAT; 137 else 138 statusbit = IPS_SRC_NAT; 139 140 switch (family) { 141 case NFPROTO_IPV4: 142 nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); 143 return; 144 case NFPROTO_IPV6: 145 nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); 146 return; 147 } 148 } 149 #endif /* CONFIG_XFRM */ 150 151 /* We keep an extra hash for each conntrack, for fast searching. */ 152 static unsigned int 153 hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) 154 { 155 unsigned int hash; 156 157 get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); 158 159 /* Original src, to ensure we map it consistently if poss. */ 160 hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), 161 tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); 162 163 return reciprocal_scale(hash, nf_nat_htable_size); 164 } 165 166 /* Is this tuple already taken? (not by us) */ 167 static int 168 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, 169 const struct nf_conn *ignored_conntrack) 170 { 171 /* Conntrack tracking doesn't keep track of outgoing tuples; only 172 * incoming ones. NAT means they don't have a fixed mapping, 173 * so we invert the tuple and look for the incoming reply. 174 * 175 * We could keep a separate hash if this proves too slow. 176 */ 177 struct nf_conntrack_tuple reply; 178 179 nf_ct_invert_tuple(&reply, tuple); 180 return nf_conntrack_tuple_taken(&reply, ignored_conntrack); 181 } 182 183 static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, 184 const struct nf_nat_range2 *range) 185 { 186 if (t->src.l3num == NFPROTO_IPV4) 187 return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && 188 ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); 189 190 return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && 191 ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; 192 } 193 194 /* Is the manipable part of the tuple between min and max incl? */ 195 static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, 196 enum nf_nat_manip_type maniptype, 197 const union nf_conntrack_man_proto *min, 198 const union nf_conntrack_man_proto *max) 199 { 200 __be16 port; 201 202 switch (tuple->dst.protonum) { 203 case IPPROTO_ICMP: 204 case IPPROTO_ICMPV6: 205 return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && 206 ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); 207 case IPPROTO_GRE: /* all fall though */ 208 case IPPROTO_TCP: 209 case IPPROTO_UDP: 210 case IPPROTO_UDPLITE: 211 case IPPROTO_DCCP: 212 case IPPROTO_SCTP: 213 if (maniptype == NF_NAT_MANIP_SRC) 214 port = tuple->src.u.all; 215 else 216 port = tuple->dst.u.all; 217 218 return ntohs(port) >= ntohs(min->all) && 219 ntohs(port) <= ntohs(max->all); 220 default: 221 return true; 222 } 223 } 224 225 /* If we source map this tuple so reply looks like reply_tuple, will 226 * that meet the constraints of range. 227 */ 228 static int in_range(const struct nf_conntrack_tuple *tuple, 229 const struct nf_nat_range2 *range) 230 { 231 /* If we are supposed to map IPs, then we must be in the 232 * range specified, otherwise let this drag us onto a new src IP. 233 */ 234 if (range->flags & NF_NAT_RANGE_MAP_IPS && 235 !nf_nat_inet_in_range(tuple, range)) 236 return 0; 237 238 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) 239 return 1; 240 241 return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, 242 &range->min_proto, &range->max_proto); 243 } 244 245 static inline int 246 same_src(const struct nf_conn *ct, 247 const struct nf_conntrack_tuple *tuple) 248 { 249 const struct nf_conntrack_tuple *t; 250 251 t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 252 return (t->dst.protonum == tuple->dst.protonum && 253 nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && 254 t->src.u.all == tuple->src.u.all); 255 } 256 257 /* Only called for SRC manip */ 258 static int 259 find_appropriate_src(struct net *net, 260 const struct nf_conntrack_zone *zone, 261 const struct nf_conntrack_tuple *tuple, 262 struct nf_conntrack_tuple *result, 263 const struct nf_nat_range2 *range) 264 { 265 unsigned int h = hash_by_src(net, tuple); 266 const struct nf_conn *ct; 267 268 hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { 269 if (same_src(ct, tuple) && 270 net_eq(net, nf_ct_net(ct)) && 271 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { 272 /* Copy source part from reply tuple. */ 273 nf_ct_invert_tuple(result, 274 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 275 result->dst = tuple->dst; 276 277 if (in_range(result, range)) 278 return 1; 279 } 280 } 281 return 0; 282 } 283 284 /* For [FUTURE] fragmentation handling, we want the least-used 285 * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus 286 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports 287 * 1-65535, we don't do pro-rata allocation based on ports; we choose 288 * the ip with the lowest src-ip/dst-ip/proto usage. 289 */ 290 static void 291 find_best_ips_proto(const struct nf_conntrack_zone *zone, 292 struct nf_conntrack_tuple *tuple, 293 const struct nf_nat_range2 *range, 294 const struct nf_conn *ct, 295 enum nf_nat_manip_type maniptype) 296 { 297 union nf_inet_addr *var_ipp; 298 unsigned int i, max; 299 /* Host order */ 300 u32 minip, maxip, j, dist; 301 bool full_range; 302 303 /* No IP mapping? Do nothing. */ 304 if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) 305 return; 306 307 if (maniptype == NF_NAT_MANIP_SRC) 308 var_ipp = &tuple->src.u3; 309 else 310 var_ipp = &tuple->dst.u3; 311 312 /* Fast path: only one choice. */ 313 if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { 314 *var_ipp = range->min_addr; 315 return; 316 } 317 318 if (nf_ct_l3num(ct) == NFPROTO_IPV4) 319 max = sizeof(var_ipp->ip) / sizeof(u32) - 1; 320 else 321 max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; 322 323 /* Hashing source and destination IPs gives a fairly even 324 * spread in practice (if there are a small number of IPs 325 * involved, there usually aren't that many connections 326 * anyway). The consistency means that servers see the same 327 * client coming from the same IP (some Internet Banking sites 328 * like this), even across reboots. 329 */ 330 j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), 331 range->flags & NF_NAT_RANGE_PERSISTENT ? 332 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); 333 334 full_range = false; 335 for (i = 0; i <= max; i++) { 336 /* If first bytes of the address are at the maximum, use the 337 * distance. Otherwise use the full range. 338 */ 339 if (!full_range) { 340 minip = ntohl((__force __be32)range->min_addr.all[i]); 341 maxip = ntohl((__force __be32)range->max_addr.all[i]); 342 dist = maxip - minip + 1; 343 } else { 344 minip = 0; 345 dist = ~0; 346 } 347 348 var_ipp->all[i] = (__force __u32) 349 htonl(minip + reciprocal_scale(j, dist)); 350 if (var_ipp->all[i] != range->max_addr.all[i]) 351 full_range = true; 352 353 if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) 354 j ^= (__force u32)tuple->dst.u3.all[i]; 355 } 356 } 357 358 /* Alter the per-proto part of the tuple (depending on maniptype), to 359 * give a unique tuple in the given range if possible. 360 * 361 * Per-protocol part of tuple is initialized to the incoming packet. 362 */ 363 static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, 364 const struct nf_nat_range2 *range, 365 enum nf_nat_manip_type maniptype, 366 const struct nf_conn *ct) 367 { 368 unsigned int range_size, min, max, i, attempts; 369 __be16 *keyptr; 370 u16 off; 371 static const unsigned int max_attempts = 128; 372 373 switch (tuple->dst.protonum) { 374 case IPPROTO_ICMP: 375 case IPPROTO_ICMPV6: 376 /* id is same for either direction... */ 377 keyptr = &tuple->src.u.icmp.id; 378 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 379 min = 0; 380 range_size = 65536; 381 } else { 382 min = ntohs(range->min_proto.icmp.id); 383 range_size = ntohs(range->max_proto.icmp.id) - 384 ntohs(range->min_proto.icmp.id) + 1; 385 } 386 goto find_free_id; 387 #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) 388 case IPPROTO_GRE: 389 /* If there is no master conntrack we are not PPTP, 390 do not change tuples */ 391 if (!ct->master) 392 return; 393 394 if (maniptype == NF_NAT_MANIP_SRC) 395 keyptr = &tuple->src.u.gre.key; 396 else 397 keyptr = &tuple->dst.u.gre.key; 398 399 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 400 min = 1; 401 range_size = 65535; 402 } else { 403 min = ntohs(range->min_proto.gre.key); 404 range_size = ntohs(range->max_proto.gre.key) - min + 1; 405 } 406 goto find_free_id; 407 #endif 408 case IPPROTO_UDP: 409 case IPPROTO_UDPLITE: 410 case IPPROTO_TCP: 411 case IPPROTO_SCTP: 412 case IPPROTO_DCCP: 413 if (maniptype == NF_NAT_MANIP_SRC) 414 keyptr = &tuple->src.u.all; 415 else 416 keyptr = &tuple->dst.u.all; 417 418 break; 419 default: 420 return; 421 } 422 423 /* If no range specified... */ 424 if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { 425 /* If it's dst rewrite, can't change port */ 426 if (maniptype == NF_NAT_MANIP_DST) 427 return; 428 429 if (ntohs(*keyptr) < 1024) { 430 /* Loose convention: >> 512 is credential passing */ 431 if (ntohs(*keyptr) < 512) { 432 min = 1; 433 range_size = 511 - min + 1; 434 } else { 435 min = 600; 436 range_size = 1023 - min + 1; 437 } 438 } else { 439 min = 1024; 440 range_size = 65535 - 1024 + 1; 441 } 442 } else { 443 min = ntohs(range->min_proto.all); 444 max = ntohs(range->max_proto.all); 445 if (unlikely(max < min)) 446 swap(max, min); 447 range_size = max - min + 1; 448 } 449 450 find_free_id: 451 if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) 452 off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); 453 else 454 off = prandom_u32(); 455 456 attempts = range_size; 457 if (attempts > max_attempts) 458 attempts = max_attempts; 459 460 /* We are in softirq; doing a search of the entire range risks 461 * soft lockup when all tuples are already used. 462 * 463 * If we can't find any free port from first offset, pick a new 464 * one and try again, with ever smaller search window. 465 */ 466 another_round: 467 for (i = 0; i < attempts; i++, off++) { 468 *keyptr = htons(min + off % range_size); 469 if (!nf_nat_used_tuple(tuple, ct)) 470 return; 471 } 472 473 if (attempts >= range_size || attempts < 16) 474 return; 475 attempts /= 2; 476 off = prandom_u32(); 477 goto another_round; 478 } 479 480 /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, 481 * we change the source to map into the range. For NF_INET_PRE_ROUTING 482 * and NF_INET_LOCAL_OUT, we change the destination to map into the 483 * range. It might not be possible to get a unique tuple, but we try. 484 * At worst (or if we race), we will end up with a final duplicate in 485 * __nf_conntrack_confirm and drop the packet. */ 486 static void 487 get_unique_tuple(struct nf_conntrack_tuple *tuple, 488 const struct nf_conntrack_tuple *orig_tuple, 489 const struct nf_nat_range2 *range, 490 struct nf_conn *ct, 491 enum nf_nat_manip_type maniptype) 492 { 493 const struct nf_conntrack_zone *zone; 494 struct net *net = nf_ct_net(ct); 495 496 zone = nf_ct_zone(ct); 497 498 /* 1) If this srcip/proto/src-proto-part is currently mapped, 499 * and that same mapping gives a unique tuple within the given 500 * range, use that. 501 * 502 * This is only required for source (ie. NAT/masq) mappings. 503 * So far, we don't do local source mappings, so multiple 504 * manips not an issue. 505 */ 506 if (maniptype == NF_NAT_MANIP_SRC && 507 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 508 /* try the original tuple first */ 509 if (in_range(orig_tuple, range)) { 510 if (!nf_nat_used_tuple(orig_tuple, ct)) { 511 *tuple = *orig_tuple; 512 return; 513 } 514 } else if (find_appropriate_src(net, zone, 515 orig_tuple, tuple, range)) { 516 pr_debug("get_unique_tuple: Found current src map\n"); 517 if (!nf_nat_used_tuple(tuple, ct)) 518 return; 519 } 520 } 521 522 /* 2) Select the least-used IP/proto combination in the given range */ 523 *tuple = *orig_tuple; 524 find_best_ips_proto(zone, tuple, range, ct, maniptype); 525 526 /* 3) The per-protocol part of the manip is made to map into 527 * the range to make a unique tuple. 528 */ 529 530 /* Only bother mapping if it's not already in range and unique */ 531 if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 532 if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { 533 if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && 534 l4proto_in_range(tuple, maniptype, 535 &range->min_proto, 536 &range->max_proto) && 537 (range->min_proto.all == range->max_proto.all || 538 !nf_nat_used_tuple(tuple, ct))) 539 return; 540 } else if (!nf_nat_used_tuple(tuple, ct)) { 541 return; 542 } 543 } 544 545 /* Last chance: get protocol to try to obtain unique tuple. */ 546 nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); 547 } 548 549 struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) 550 { 551 struct nf_conn_nat *nat = nfct_nat(ct); 552 if (nat) 553 return nat; 554 555 if (!nf_ct_is_confirmed(ct)) 556 nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); 557 558 return nat; 559 } 560 EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); 561 562 unsigned int 563 nf_nat_setup_info(struct nf_conn *ct, 564 const struct nf_nat_range2 *range, 565 enum nf_nat_manip_type maniptype) 566 { 567 struct net *net = nf_ct_net(ct); 568 struct nf_conntrack_tuple curr_tuple, new_tuple; 569 570 /* Can't setup nat info for confirmed ct. */ 571 if (nf_ct_is_confirmed(ct)) 572 return NF_ACCEPT; 573 574 WARN_ON(maniptype != NF_NAT_MANIP_SRC && 575 maniptype != NF_NAT_MANIP_DST); 576 577 if (WARN_ON(nf_nat_initialized(ct, maniptype))) 578 return NF_DROP; 579 580 /* What we've got will look like inverse of reply. Normally 581 * this is what is in the conntrack, except for prior 582 * manipulations (future optimization: if num_manips == 0, 583 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 584 */ 585 nf_ct_invert_tuple(&curr_tuple, 586 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 587 588 get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); 589 590 if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { 591 struct nf_conntrack_tuple reply; 592 593 /* Alter conntrack table so will recognize replies. */ 594 nf_ct_invert_tuple(&reply, &new_tuple); 595 nf_conntrack_alter_reply(ct, &reply); 596 597 /* Non-atomic: we own this at the moment. */ 598 if (maniptype == NF_NAT_MANIP_SRC) 599 ct->status |= IPS_SRC_NAT; 600 else 601 ct->status |= IPS_DST_NAT; 602 603 if (nfct_help(ct) && !nfct_seqadj(ct)) 604 if (!nfct_seqadj_ext_add(ct)) 605 return NF_DROP; 606 } 607 608 if (maniptype == NF_NAT_MANIP_SRC) { 609 unsigned int srchash; 610 spinlock_t *lock; 611 612 srchash = hash_by_src(net, 613 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 614 lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; 615 spin_lock_bh(lock); 616 hlist_add_head_rcu(&ct->nat_bysource, 617 &nf_nat_bysource[srchash]); 618 spin_unlock_bh(lock); 619 } 620 621 /* It's done. */ 622 if (maniptype == NF_NAT_MANIP_DST) 623 ct->status |= IPS_DST_NAT_DONE; 624 else 625 ct->status |= IPS_SRC_NAT_DONE; 626 627 return NF_ACCEPT; 628 } 629 EXPORT_SYMBOL(nf_nat_setup_info); 630 631 static unsigned int 632 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) 633 { 634 /* Force range to this IP; let proto decide mapping for 635 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). 636 * Use reply in case it's already been mangled (eg local packet). 637 */ 638 union nf_inet_addr ip = 639 (manip == NF_NAT_MANIP_SRC ? 640 ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : 641 ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); 642 struct nf_nat_range2 range = { 643 .flags = NF_NAT_RANGE_MAP_IPS, 644 .min_addr = ip, 645 .max_addr = ip, 646 }; 647 return nf_nat_setup_info(ct, &range, manip); 648 } 649 650 unsigned int 651 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) 652 { 653 return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); 654 } 655 EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); 656 657 /* Do packet manipulations according to nf_nat_setup_info. */ 658 unsigned int nf_nat_packet(struct nf_conn *ct, 659 enum ip_conntrack_info ctinfo, 660 unsigned int hooknum, 661 struct sk_buff *skb) 662 { 663 enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); 664 enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); 665 unsigned int verdict = NF_ACCEPT; 666 unsigned long statusbit; 667 668 if (mtype == NF_NAT_MANIP_SRC) 669 statusbit = IPS_SRC_NAT; 670 else 671 statusbit = IPS_DST_NAT; 672 673 /* Invert if this is reply dir. */ 674 if (dir == IP_CT_DIR_REPLY) 675 statusbit ^= IPS_NAT_MASK; 676 677 /* Non-atomic: these bits don't change. */ 678 if (ct->status & statusbit) 679 verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); 680 681 return verdict; 682 } 683 EXPORT_SYMBOL_GPL(nf_nat_packet); 684 685 unsigned int 686 nf_nat_inet_fn(void *priv, struct sk_buff *skb, 687 const struct nf_hook_state *state) 688 { 689 struct nf_conn *ct; 690 enum ip_conntrack_info ctinfo; 691 struct nf_conn_nat *nat; 692 /* maniptype == SRC for postrouting. */ 693 enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); 694 695 ct = nf_ct_get(skb, &ctinfo); 696 /* Can't track? It's not due to stress, or conntrack would 697 * have dropped it. Hence it's the user's responsibilty to 698 * packet filter it out, or implement conntrack/NAT for that 699 * protocol. 8) --RR 700 */ 701 if (!ct) 702 return NF_ACCEPT; 703 704 nat = nfct_nat(ct); 705 706 switch (ctinfo) { 707 case IP_CT_RELATED: 708 case IP_CT_RELATED_REPLY: 709 /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ 710 case IP_CT_NEW: 711 /* Seen it before? This can happen for loopback, retrans, 712 * or local packets. 713 */ 714 if (!nf_nat_initialized(ct, maniptype)) { 715 struct nf_nat_lookup_hook_priv *lpriv = priv; 716 struct nf_hook_entries *e = rcu_dereference(lpriv->entries); 717 unsigned int ret; 718 int i; 719 720 if (!e) 721 goto null_bind; 722 723 for (i = 0; i < e->num_hook_entries; i++) { 724 ret = e->hooks[i].hook(e->hooks[i].priv, skb, 725 state); 726 if (ret != NF_ACCEPT) 727 return ret; 728 if (nf_nat_initialized(ct, maniptype)) 729 goto do_nat; 730 } 731 null_bind: 732 ret = nf_nat_alloc_null_binding(ct, state->hook); 733 if (ret != NF_ACCEPT) 734 return ret; 735 } else { 736 pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", 737 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", 738 ct, ct->status); 739 if (nf_nat_oif_changed(state->hook, ctinfo, nat, 740 state->out)) 741 goto oif_changed; 742 } 743 break; 744 default: 745 /* ESTABLISHED */ 746 WARN_ON(ctinfo != IP_CT_ESTABLISHED && 747 ctinfo != IP_CT_ESTABLISHED_REPLY); 748 if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) 749 goto oif_changed; 750 } 751 do_nat: 752 return nf_nat_packet(ct, ctinfo, state->hook, skb); 753 754 oif_changed: 755 nf_ct_kill_acct(ct, ctinfo, skb); 756 return NF_DROP; 757 } 758 EXPORT_SYMBOL_GPL(nf_nat_inet_fn); 759 760 struct nf_nat_proto_clean { 761 u8 l3proto; 762 u8 l4proto; 763 }; 764 765 /* kill conntracks with affected NAT section */ 766 static int nf_nat_proto_remove(struct nf_conn *i, void *data) 767 { 768 const struct nf_nat_proto_clean *clean = data; 769 770 if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || 771 (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) 772 return 0; 773 774 return i->status & IPS_NAT_MASK ? 1 : 0; 775 } 776 777 static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) 778 { 779 unsigned int h; 780 781 h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 782 spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); 783 hlist_del_rcu(&ct->nat_bysource); 784 spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); 785 } 786 787 static int nf_nat_proto_clean(struct nf_conn *ct, void *data) 788 { 789 if (nf_nat_proto_remove(ct, data)) 790 return 1; 791 792 /* This module is being removed and conntrack has nat null binding. 793 * Remove it from bysource hash, as the table will be freed soon. 794 * 795 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() 796 * will delete entry from already-freed table. 797 */ 798 if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) 799 __nf_nat_cleanup_conntrack(ct); 800 801 /* don't delete conntrack. Although that would make things a lot 802 * simpler, we'd end up flushing all conntracks on nat rmmod. 803 */ 804 return 0; 805 } 806 807 /* No one using conntrack by the time this called. */ 808 static void nf_nat_cleanup_conntrack(struct nf_conn *ct) 809 { 810 if (ct->status & IPS_SRC_NAT_DONE) 811 __nf_nat_cleanup_conntrack(ct); 812 } 813 814 static struct nf_ct_ext_type nat_extend __read_mostly = { 815 .len = sizeof(struct nf_conn_nat), 816 .align = __alignof__(struct nf_conn_nat), 817 .destroy = nf_nat_cleanup_conntrack, 818 .id = NF_CT_EXT_NAT, 819 }; 820 821 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 822 823 #include <linux/netfilter/nfnetlink.h> 824 #include <linux/netfilter/nfnetlink_conntrack.h> 825 826 static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { 827 [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, 828 [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, 829 }; 830 831 static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], 832 struct nf_nat_range2 *range) 833 { 834 if (tb[CTA_PROTONAT_PORT_MIN]) { 835 range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); 836 range->max_proto.all = range->min_proto.all; 837 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 838 } 839 if (tb[CTA_PROTONAT_PORT_MAX]) { 840 range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); 841 range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; 842 } 843 return 0; 844 } 845 846 static int nfnetlink_parse_nat_proto(struct nlattr *attr, 847 const struct nf_conn *ct, 848 struct nf_nat_range2 *range) 849 { 850 struct nlattr *tb[CTA_PROTONAT_MAX+1]; 851 int err; 852 853 err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, 854 protonat_nla_policy, NULL); 855 if (err < 0) 856 return err; 857 858 return nf_nat_l4proto_nlattr_to_range(tb, range); 859 } 860 861 static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { 862 [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, 863 [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, 864 [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, 865 [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, 866 [CTA_NAT_PROTO] = { .type = NLA_NESTED }, 867 }; 868 869 static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], 870 struct nf_nat_range2 *range) 871 { 872 if (tb[CTA_NAT_V4_MINIP]) { 873 range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); 874 range->flags |= NF_NAT_RANGE_MAP_IPS; 875 } 876 877 if (tb[CTA_NAT_V4_MAXIP]) 878 range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); 879 else 880 range->max_addr.ip = range->min_addr.ip; 881 882 return 0; 883 } 884 885 static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], 886 struct nf_nat_range2 *range) 887 { 888 if (tb[CTA_NAT_V6_MINIP]) { 889 nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], 890 sizeof(struct in6_addr)); 891 range->flags |= NF_NAT_RANGE_MAP_IPS; 892 } 893 894 if (tb[CTA_NAT_V6_MAXIP]) 895 nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], 896 sizeof(struct in6_addr)); 897 else 898 range->max_addr = range->min_addr; 899 900 return 0; 901 } 902 903 static int 904 nfnetlink_parse_nat(const struct nlattr *nat, 905 const struct nf_conn *ct, struct nf_nat_range2 *range) 906 { 907 struct nlattr *tb[CTA_NAT_MAX+1]; 908 int err; 909 910 memset(range, 0, sizeof(*range)); 911 912 err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, 913 nat_nla_policy, NULL); 914 if (err < 0) 915 return err; 916 917 switch (nf_ct_l3num(ct)) { 918 case NFPROTO_IPV4: 919 err = nf_nat_ipv4_nlattr_to_range(tb, range); 920 break; 921 case NFPROTO_IPV6: 922 err = nf_nat_ipv6_nlattr_to_range(tb, range); 923 break; 924 default: 925 err = -EPROTONOSUPPORT; 926 break; 927 } 928 929 if (err) 930 return err; 931 932 if (!tb[CTA_NAT_PROTO]) 933 return 0; 934 935 return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); 936 } 937 938 /* This function is called under rcu_read_lock() */ 939 static int 940 nfnetlink_parse_nat_setup(struct nf_conn *ct, 941 enum nf_nat_manip_type manip, 942 const struct nlattr *attr) 943 { 944 struct nf_nat_range2 range; 945 int err; 946 947 /* Should not happen, restricted to creating new conntracks 948 * via ctnetlink. 949 */ 950 if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) 951 return -EEXIST; 952 953 /* No NAT information has been passed, allocate the null-binding */ 954 if (attr == NULL) 955 return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; 956 957 err = nfnetlink_parse_nat(attr, ct, &range); 958 if (err < 0) 959 return err; 960 961 return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; 962 } 963 #else 964 static int 965 nfnetlink_parse_nat_setup(struct nf_conn *ct, 966 enum nf_nat_manip_type manip, 967 const struct nlattr *attr) 968 { 969 return -EOPNOTSUPP; 970 } 971 #endif 972 973 static struct nf_ct_helper_expectfn follow_master_nat = { 974 .name = "nat-follow-master", 975 .expectfn = nf_nat_follow_master, 976 }; 977 978 int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 979 const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) 980 { 981 struct nat_net *nat_net = net_generic(net, nat_net_id); 982 struct nf_nat_hooks_net *nat_proto_net; 983 struct nf_nat_lookup_hook_priv *priv; 984 unsigned int hooknum = ops->hooknum; 985 struct nf_hook_ops *nat_ops; 986 int i, ret; 987 988 if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) 989 return -EINVAL; 990 991 nat_proto_net = &nat_net->nat_proto_net[pf]; 992 993 for (i = 0; i < ops_count; i++) { 994 if (orig_nat_ops[i].hooknum == hooknum) { 995 hooknum = i; 996 break; 997 } 998 } 999 1000 if (WARN_ON_ONCE(i == ops_count)) 1001 return -EINVAL; 1002 1003 mutex_lock(&nf_nat_proto_mutex); 1004 if (!nat_proto_net->nat_hook_ops) { 1005 WARN_ON(nat_proto_net->users != 0); 1006 1007 nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); 1008 if (!nat_ops) { 1009 mutex_unlock(&nf_nat_proto_mutex); 1010 return -ENOMEM; 1011 } 1012 1013 for (i = 0; i < ops_count; i++) { 1014 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1015 if (priv) { 1016 nat_ops[i].priv = priv; 1017 continue; 1018 } 1019 mutex_unlock(&nf_nat_proto_mutex); 1020 while (i) 1021 kfree(nat_ops[--i].priv); 1022 kfree(nat_ops); 1023 return -ENOMEM; 1024 } 1025 1026 ret = nf_register_net_hooks(net, nat_ops, ops_count); 1027 if (ret < 0) { 1028 mutex_unlock(&nf_nat_proto_mutex); 1029 for (i = 0; i < ops_count; i++) 1030 kfree(nat_ops[i].priv); 1031 kfree(nat_ops); 1032 return ret; 1033 } 1034 1035 nat_proto_net->nat_hook_ops = nat_ops; 1036 } 1037 1038 nat_ops = nat_proto_net->nat_hook_ops; 1039 priv = nat_ops[hooknum].priv; 1040 if (WARN_ON_ONCE(!priv)) { 1041 mutex_unlock(&nf_nat_proto_mutex); 1042 return -EOPNOTSUPP; 1043 } 1044 1045 ret = nf_hook_entries_insert_raw(&priv->entries, ops); 1046 if (ret == 0) 1047 nat_proto_net->users++; 1048 1049 mutex_unlock(&nf_nat_proto_mutex); 1050 return ret; 1051 } 1052 1053 void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, 1054 unsigned int ops_count) 1055 { 1056 struct nat_net *nat_net = net_generic(net, nat_net_id); 1057 struct nf_nat_hooks_net *nat_proto_net; 1058 struct nf_nat_lookup_hook_priv *priv; 1059 struct nf_hook_ops *nat_ops; 1060 int hooknum = ops->hooknum; 1061 int i; 1062 1063 if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) 1064 return; 1065 1066 nat_proto_net = &nat_net->nat_proto_net[pf]; 1067 1068 mutex_lock(&nf_nat_proto_mutex); 1069 if (WARN_ON(nat_proto_net->users == 0)) 1070 goto unlock; 1071 1072 nat_proto_net->users--; 1073 1074 nat_ops = nat_proto_net->nat_hook_ops; 1075 for (i = 0; i < ops_count; i++) { 1076 if (nat_ops[i].hooknum == hooknum) { 1077 hooknum = i; 1078 break; 1079 } 1080 } 1081 if (WARN_ON_ONCE(i == ops_count)) 1082 goto unlock; 1083 priv = nat_ops[hooknum].priv; 1084 nf_hook_entries_delete_raw(&priv->entries, ops); 1085 1086 if (nat_proto_net->users == 0) { 1087 nf_unregister_net_hooks(net, nat_ops, ops_count); 1088 1089 for (i = 0; i < ops_count; i++) { 1090 priv = nat_ops[i].priv; 1091 kfree_rcu(priv, rcu_head); 1092 } 1093 1094 nat_proto_net->nat_hook_ops = NULL; 1095 kfree(nat_ops); 1096 } 1097 unlock: 1098 mutex_unlock(&nf_nat_proto_mutex); 1099 } 1100 1101 static struct pernet_operations nat_net_ops = { 1102 .id = &nat_net_id, 1103 .size = sizeof(struct nat_net), 1104 }; 1105 1106 static struct nf_nat_hook nat_hook = { 1107 .parse_nat_setup = nfnetlink_parse_nat_setup, 1108 #ifdef CONFIG_XFRM 1109 .decode_session = __nf_nat_decode_session, 1110 #endif 1111 .manip_pkt = nf_nat_manip_pkt, 1112 }; 1113 1114 static int __init nf_nat_init(void) 1115 { 1116 int ret, i; 1117 1118 /* Leave them the same for the moment. */ 1119 nf_nat_htable_size = nf_conntrack_htable_size; 1120 if (nf_nat_htable_size < CONNTRACK_LOCKS) 1121 nf_nat_htable_size = CONNTRACK_LOCKS; 1122 1123 nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); 1124 if (!nf_nat_bysource) 1125 return -ENOMEM; 1126 1127 ret = nf_ct_extend_register(&nat_extend); 1128 if (ret < 0) { 1129 kvfree(nf_nat_bysource); 1130 pr_err("Unable to register extension\n"); 1131 return ret; 1132 } 1133 1134 for (i = 0; i < CONNTRACK_LOCKS; i++) 1135 spin_lock_init(&nf_nat_locks[i]); 1136 1137 ret = register_pernet_subsys(&nat_net_ops); 1138 if (ret < 0) { 1139 nf_ct_extend_unregister(&nat_extend); 1140 kvfree(nf_nat_bysource); 1141 return ret; 1142 } 1143 1144 nf_ct_helper_expectfn_register(&follow_master_nat); 1145 1146 WARN_ON(nf_nat_hook != NULL); 1147 RCU_INIT_POINTER(nf_nat_hook, &nat_hook); 1148 1149 return 0; 1150 } 1151 1152 static void __exit nf_nat_cleanup(void) 1153 { 1154 struct nf_nat_proto_clean clean = {}; 1155 1156 nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); 1157 1158 nf_ct_extend_unregister(&nat_extend); 1159 nf_ct_helper_expectfn_unregister(&follow_master_nat); 1160 RCU_INIT_POINTER(nf_nat_hook, NULL); 1161 1162 synchronize_net(); 1163 kvfree(nf_nat_bysource); 1164 unregister_pernet_subsys(&nat_net_ops); 1165 } 1166 1167 MODULE_LICENSE("GPL"); 1168 1169 module_init(nf_nat_init); 1170 module_exit(nf_nat_cleanup); 1171