/*
 * (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2011 Patrick McHardy <kaber@trash.net>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/rtnetlink.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_l3proto.h>
#include <net/netfilter/nf_nat_l4proto.h>
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>

static DEFINE_SPINLOCK(nf_nat_lock);

static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
						__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
						__read_mostly;


inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
	return rcu_dereference(nf_nat_l3protos[family]);
}

inline const struct nf_nat_l4proto *
__nf_nat_l4proto_find(u8 family, u8 protonum)
{
	return rcu_dereference(nf_nat_l4protos[family][protonum]);
}
EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);

#ifdef CONFIG_XFRM
static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned long statusbit;
	u8 family;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return;

	family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(family);
	if (l3proto == NULL)
		goto out;

	dir = CTINFO2DIR(ctinfo);
	if (dir == IP_CT_DIR_ORIGINAL)
		statusbit = IPS_DST_NAT;
	else
		statusbit = IPS_SRC_NAT;

	l3proto->decode_session(skb, ct, dir, statusbit, fl);
out:
	rcu_read_unlock();
}

int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
{
	struct flowi fl;
	unsigned int hh_len;
	struct dst_entry *dst;
	int err;

	err = xfrm_decode_session(skb, &fl, family);
	if (err < 0)
		return err;

	dst = skb_dst(skb);
	if (dst->xfrm)
		dst = ((struct xfrm_dst *)dst)->route;
	dst_hold(dst);

	dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* Change in oif may mean change in hh_len. */
	hh_len = skb_dst(skb)->dev->hard_header_len;
	if (skb_headroom(skb) < hh_len &&
	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */

/* We keep an extra hash for each conntrack, for fast searching. */
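/* Illustrative note (not part of the original source): hash_by_src()
 * below buckets conntracks by their original source tuple.
 * reciprocal_scale() avoids a modulo by multiplying the 32-bit hash
 * with the table size and keeping the upper 32 bits:
 *
 *	bucket = ((u64)hash * nat_htable_size) >> 32;
 *
 * e.g. hash 0xdeadbeef with a 16384-slot table lands in bucket
 * (0xdeadbeef * 16384) >> 32 == 14251.
 */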
static inline unsigned int
hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
{
	unsigned int hash;

	/* Original src, to ensure we map it consistently if poss. */
	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
		      tuple->dst.protonum ^ nf_conntrack_hash_rnd);

	return reciprocal_scale(hash, net->ct.nat_htable_size);
}

/* Is this tuple already taken? (not by us) */
int
nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
		  const struct nf_conn *ignored_conntrack)
{
	/* Conntrack doesn't keep track of outgoing tuples; only
	 * incoming ones.  NAT means they don't have a fixed mapping,
	 * so we invert the tuple and look for the incoming reply.
	 *
	 * We could keep a separate hash if this proves too slow.
	 */
	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuplepr(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
}
EXPORT_SYMBOL(nf_nat_used_tuple);

/* If we source-map this tuple so the reply looks like reply_tuple, will
 * that meet the constraints of the range?
 */
static int in_range(const struct nf_nat_l3proto *l3proto,
		    const struct nf_nat_l4proto *l4proto,
		    const struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range)
{
	/* If we are supposed to map IPs, then we must be in the
	 * range specified, otherwise let this drag us onto a new src IP.
	 */
	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
	    !l3proto->in_range(tuple, range))
		return 0;

	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
	    l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
			      &range->min_proto, &range->max_proto))
		return 1;

	return 0;
}

static inline int
same_src(const struct nf_conn *ct,
	 const struct nf_conntrack_tuple *tuple)
{
	const struct nf_conntrack_tuple *t;

	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
	return (t->dst.protonum == tuple->dst.protonum &&
		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
		t->src.u.all == tuple->src.u.all);
}

/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_nat_l3proto *l3proto,
		     const struct nf_nat_l4proto *l4proto,
		     const struct nf_conntrack_tuple *tuple,
		     struct nf_conntrack_tuple *result,
		     const struct nf_nat_range *range)
{
	unsigned int h = hash_by_src(net, tuple);
	const struct nf_conn_nat *nat;
	const struct nf_conn *ct;

	hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
		ct = nat->ct;
		if (same_src(ct, tuple) &&
		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
			/* Copy source part from reply tuple. */
			nf_ct_invert_tuplepr(result,
				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
			result->dst = tuple->dst;

			if (in_range(l3proto, l4proto, result, range))
				return 1;
		}
	}
	return 0;
}

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 */
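/* Worked example (illustrative, not in the original): with
 * NF_NAT_RANGE_MAP_IPS over 1.2.3.4 - 1.2.3.7, the loop below hashes
 * the client's source address and scales it into the four-address
 * range:
 *
 *	j      = jhash2((u32 *)&tuple->src.u3, ...);
 *	offset = reciprocal_scale(j, 4);	(0..3)
 *	ip     = htonl(ntohl(minip) + offset);
 *
 * The same client always hashes to the same offset, so it keeps its
 * mapped address across connections; NF_NAT_RANGE_PERSISTENT keeps it
 * stable across different destinations as well, by not mixing the
 * destination address into j.
 */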
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
		    struct nf_conntrack_tuple *tuple,
		    const struct nf_nat_range *range,
		    const struct nf_conn *ct,
		    enum nf_nat_manip_type maniptype)
{
	union nf_inet_addr *var_ipp;
	unsigned int i, max;
	/* Host order */
	u32 minip, maxip, j, dist;
	bool full_range;

	/* No IP mapping?  Do nothing. */
	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
		return;

	if (maniptype == NF_NAT_MANIP_SRC)
		var_ipp = &tuple->src.u3;
	else
		var_ipp = &tuple->dst.u3;

	/* Fast path: only one choice. */
	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
		*var_ipp = range->min_addr;
		return;
	}

	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
	else
		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

	/* Hashing source and destination IPs gives a fairly even
	 * spread in practice (if there are a small number of IPs
	 * involved, there usually aren't that many connections
	 * anyway).  The consistency means that servers see the same
	 * client coming from the same IP (some Internet banking sites
	 * like this), even across reboots.
	 */
	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
		   range->flags & NF_NAT_RANGE_PERSISTENT ?
			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

	full_range = false;
	for (i = 0; i <= max; i++) {
		/* If the first bytes of the address are at the maximum,
		 * use the distance.  Otherwise use the full range.
		 */
		if (!full_range) {
			minip = ntohl((__force __be32)range->min_addr.all[i]);
			maxip = ntohl((__force __be32)range->max_addr.all[i]);
			dist  = maxip - minip + 1;
		} else {
			minip = 0;
			dist  = ~0;
		}

		var_ipp->all[i] = (__force __u32)
			htonl(minip + reciprocal_scale(j, dist));
		if (var_ipp->all[i] != range->max_addr.all[i])
			full_range = true;

		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
			j ^= (__force u32)tuple->dst.u3.all[i];
	}
}

/* Manipulate the tuple into the range given.  For NF_INET_POST_ROUTING,
 * we change the source to map into the range.  For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range.  It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet. */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
		 const struct nf_conntrack_tuple *orig_tuple,
		 const struct nf_nat_range *range,
		 struct nf_conn *ct,
		 enum nf_nat_manip_type maniptype)
{
	const struct nf_conntrack_zone *zone;
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	struct net *net = nf_ct_net(ct);

	zone = nf_ct_zone(ct);

	rcu_read_lock();
	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
					orig_tuple->dst.protonum);

	/* 1) If this srcip/proto/src-proto-part is currently mapped,
	 * and that same mapping gives a unique tuple within the given
	 * range, use that.
	 *
	 * This is only required for source (ie. NAT/masq) mappings.
	 * So far, we don't do local source mappings, so multiple
	 * manips are not an issue.
	 */
	if (maniptype == NF_NAT_MANIP_SRC &&
	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		/* try the original tuple first */
		if (in_range(l3proto, l4proto, orig_tuple, range)) {
			if (!nf_nat_used_tuple(orig_tuple, ct)) {
				*tuple = *orig_tuple;
				goto out;
			}
		} else if (find_appropriate_src(net, zone, l3proto, l4proto,
						orig_tuple, tuple, range)) {
			pr_debug("get_unique_tuple: Found current src map\n");
			if (!nf_nat_used_tuple(tuple, ct))
				goto out;
		}
	}

	/* 2) Select the least-used IP/proto combination in the given range */
	*tuple = *orig_tuple;
	find_best_ips_proto(zone, tuple, range, ct, maniptype);

	/* 3) The per-protocol part of the manip is made to map into
	 * the range to make a unique tuple.
	 */

	/* Only bother mapping if it's not already in range and unique */
	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
			if (l4proto->in_range(tuple, maniptype,
					      &range->min_proto,
					      &range->max_proto) &&
			    (range->min_proto.all == range->max_proto.all ||
			     !nf_nat_used_tuple(tuple, ct)))
				goto out;
		} else if (!nf_nat_used_tuple(tuple, ct)) {
			goto out;
		}
	}

	/* Last chance: get protocol to try to obtain unique tuple. */
	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
	rcu_read_unlock();
}
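/* Illustrative walk-through (not in the original): for a masqueraded
 * TCP connection 10.0.0.2:4001 -> 198.51.100.7:80 behind 203.0.113.1,
 * get_unique_tuple() first reuses an existing mapping for 10.0.0.2 if
 * it fits the range (step 1), otherwise rewrites the source address to
 * 203.0.113.1 (step 2), and only asks l4proto->unique_tuple() for a
 * fresh source port if 203.0.113.1:4001 is already taken (step 3).
 */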
struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nfct_nat(ct);
	if (nat)
		return nat;

	if (!nf_ct_is_confirmed(ct))
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);

	return nat;
}
EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
		  const struct nf_nat_range *range,
		  enum nf_nat_manip_type maniptype)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_tuple curr_tuple, new_tuple;
	struct nf_conn_nat *nat;

	/* A NAT helper or ctnetlink may also set up the binding. */
	nat = nf_ct_nat_ext_add(ct);
	if (nat == NULL)
		return NF_ACCEPT;

	NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC ||
		     maniptype == NF_NAT_MANIP_DST);
	BUG_ON(nf_nat_initialized(ct, maniptype));

	/* What we've got will look like inverse of reply.  Normally
	 * this is what is in the conntrack, except for prior
	 * manipulations (future optimization: if num_manips == 0,
	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
	 */
	nf_ct_invert_tuplepr(&curr_tuple,
			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);

	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
		struct nf_conntrack_tuple reply;

		/* Alter conntrack table so it will recognize replies. */
		nf_ct_invert_tuplepr(&reply, &new_tuple);
		nf_conntrack_alter_reply(ct, &reply);

		/* Non-atomic: we own this at the moment. */
		if (maniptype == NF_NAT_MANIP_SRC)
			ct->status |= IPS_SRC_NAT;
		else
			ct->status |= IPS_DST_NAT;

		if (nfct_help(ct))
			nfct_seqadj_ext_add(ct);
	}

	if (maniptype == NF_NAT_MANIP_SRC) {
		unsigned int srchash;

		srchash = hash_by_src(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
		spin_lock_bh(&nf_nat_lock);
		/* nf_conntrack_alter_reply might re-allocate extension area */
		nat = nfct_nat(ct);
		nat->ct = ct;
		hlist_add_head_rcu(&nat->bysource,
				   &net->ct.nat_bysource[srchash]);
		spin_unlock_bh(&nf_nat_lock);
	}

	/* It's done. */
	if (maniptype == NF_NAT_MANIP_DST)
		ct->status |= IPS_DST_NAT_DONE;
	else
		ct->status |= IPS_SRC_NAT_DONE;

	return NF_ACCEPT;
}
EXPORT_SYMBOL(nf_nat_setup_info);
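/* Example of the rewrite above (illustrative): SNATing
 * 10.0.0.2:4001 -> 198.51.100.7:80 to 203.0.113.1 changes the stored
 * reply tuple from 198.51.100.7:80 -> 10.0.0.2:4001 to
 * 198.51.100.7:80 -> 203.0.113.1:4001, so returning packets match the
 * conntrack entry and are reverse-translated by nf_nat_packet().
 */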
static unsigned int
__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
{
	/* Force range to this IP; let proto decide mapping for
	 * per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
	 * Use reply in case it's already been mangled (eg local packet).
	 */
	union nf_inet_addr ip =
		(manip == NF_NAT_MANIP_SRC ?
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
	struct nf_nat_range range = {
		.flags		= NF_NAT_RANGE_MAP_IPS,
		.min_addr	= ip,
		.max_addr	= ip,
	};
	return nf_nat_setup_info(ct, &range, manip);
}

unsigned int
nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
}
EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);

/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	const struct nf_nat_l3proto *l3proto;
	const struct nf_nat_l4proto *l4proto;
	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	unsigned long statusbit;
	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);

	if (mtype == NF_NAT_MANIP_SRC)
		statusbit = IPS_SRC_NAT;
	else
		statusbit = IPS_DST_NAT;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (ct->status & statusbit) {
		struct nf_conntrack_tuple target;

		/* We are aiming to look like inverse of other direction. */
		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

		l3proto = __nf_nat_l3proto_find(target.src.l3num);
		l4proto = __nf_nat_l4proto_find(target.src.l3num,
						target.dst.protonum);
		if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
			return NF_DROP;
	}
	return NF_ACCEPT;
}
EXPORT_SYMBOL_GPL(nf_nat_packet);
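/* Illustrative note (not in the original): the statusbit XOR above is
 * what makes reverse translation work.  At a source-manip hook
 * (POSTROUTING), a reply packet tests
 *
 *	IPS_SRC_NAT ^ IPS_NAT_MASK == IPS_DST_NAT
 *
 * i.e. replies on a DNATed connection get their source address
 * restored, a source manipulation driven by the DST_NAT status bit.
 */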
struct nf_nat_proto_clean {
	u8	l3proto;
	u8	l4proto;
};

/* kill conntracks with affected NAT section */
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
	const struct nf_nat_proto_clean *clean = data;
	struct nf_conn_nat *nat = nfct_nat(i);

	if (!nat)
		return 0;

	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
		return 0;

	return i->status & IPS_NAT_MASK ? 1 : 0;
}

static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
	struct nf_conn_nat *nat = nfct_nat(ct);

	if (nf_nat_proto_remove(ct, data))
		return 1;

	if (!nat || !nat->ct)
		return 0;

	/* This netns is being destroyed, and the conntrack has a NAT null
	 * binding.  Remove it from the bysource hash, as the table will
	 * be freed soon.
	 *
	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
	 * will delete the entry from an already-freed table.
	 */
	if (!del_timer(&ct->timeout))
		return 1;

	spin_lock_bh(&nf_nat_lock);
	hlist_del_rcu(&nat->bysource);
	ct->status &= ~IPS_NAT_DONE_MASK;
	nat->ct = NULL;
	spin_unlock_bh(&nf_nat_lock);

	add_timer(&ct->timeout);

	/* Don't delete the conntrack.  Although that would make things a
	 * lot simpler, we'd end up flushing all conntracks on nat rmmod.
	 */
	return 0;
}

static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
		.l4proto = l4proto,
	};
	struct net *net;

	rtnl_lock();
	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}

static void nf_nat_l3proto_clean(u8 l3proto)
{
	struct nf_nat_proto_clean clean = {
		.l3proto = l3proto,
	};
	struct net *net;

	rtnl_lock();

	for_each_net(net)
		nf_ct_iterate_cleanup(net, nf_nat_proto_remove, &clean, 0, 0);
	rtnl_unlock();
}

/* Protocol registration. */
int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	const struct nf_nat_l4proto **l4protos;
	unsigned int i;
	int ret = 0;

	mutex_lock(&nf_nat_proto_mutex);
	if (nf_nat_l4protos[l3proto] == NULL) {
		l4protos = kmalloc(IPPROTO_MAX * sizeof(struct nf_nat_l4proto *),
				   GFP_KERNEL);
		if (l4protos == NULL) {
			ret = -ENOMEM;
			goto out;
		}

		for (i = 0; i < IPPROTO_MAX; i++)
			RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);

		/* Before making proto_array visible to lockless readers,
		 * we must make sure its content is committed to memory.
		 */
		smp_wmb();

		nf_nat_l4protos[l3proto] = l4protos;
	}

	if (rcu_dereference_protected(
			nf_nat_l4protos[l3proto][l4proto->l4proto],
			lockdep_is_held(&nf_nat_proto_mutex)
			) != &nf_nat_l4proto_unknown) {
		ret = -EBUSY;
		goto out;
	}
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
out:
	mutex_unlock(&nf_nat_proto_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
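/* Registration sketch (illustrative; IPPROTO_FOO and the foo_*
 * callbacks are placeholders, not real symbols): a per-protocol NAT
 * module typically registers itself like
 *
 *	static const struct nf_nat_l4proto nf_nat_l4proto_foo = {
 *		.l4proto	= IPPROTO_FOO,
 *		.manip_pkt	= foo_manip_pkt,
 *		.in_range	= nf_nat_l4proto_in_range,
 *		.unique_tuple	= foo_unique_tuple,
 *	};
 *	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_foo);
 *
 * and unregisters on module exit, which also kills any conntracks
 * still NATed with that protocol (see nf_nat_l4proto_clean() above).
 */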
/* No one stores the protocol anywhere; simply delete it. */
void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
			 &nf_nat_l4proto_unknown);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);

int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
	int err;

	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
	if (err < 0)
		return err;

	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
			 &nf_nat_l4proto_tcp);
	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
			 &nf_nat_l4proto_udp);
	mutex_unlock(&nf_nat_proto_mutex);

	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);

void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
{
	mutex_lock(&nf_nat_proto_mutex);
	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
	mutex_unlock(&nf_nat_proto_mutex);
	synchronize_rcu();

	nf_nat_l3proto_clean(l3proto->l3proto);
	nf_ct_l3proto_module_put(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);

/* No one is using the conntrack by the time this is called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

	if (nat == NULL || nat->ct == NULL)
		return;

	NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);

	spin_lock_bh(&nf_nat_lock);
	hlist_del_rcu(&nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}

static void nf_nat_move_storage(void *new, void *old)
{
	struct nf_conn_nat *new_nat = new;
	struct nf_conn_nat *old_nat = old;
	struct nf_conn *ct = old_nat->ct;

	if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
		return;

	spin_lock_bh(&nf_nat_lock);
	hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
	spin_unlock_bh(&nf_nat_lock);
}

static struct nf_ct_ext_type nat_extend __read_mostly = {
	.len		= sizeof(struct nf_conn_nat),
	.align		= __alignof__(struct nf_conn_nat),
	.destroy	= nf_nat_cleanup_conntrack,
	.move		= nf_nat_move_storage,
	.id		= NF_CT_EXT_NAT,
	.flags		= NF_CT_EXT_F_PREALLOC,
};

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>

static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
};

static int nfnetlink_parse_nat_proto(struct nlattr *attr,
				     const struct nf_conn *ct,
				     struct nf_nat_range *range)
{
	struct nlattr *tb[CTA_PROTONAT_MAX+1];
	const struct nf_nat_l4proto *l4proto;
	int err;

	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy);
	if (err < 0)
		return err;

	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto->nlattr_to_range)
		err = l4proto->nlattr_to_range(tb, range);

	return err;
}
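/* Attribute layout parsed below (illustrative values): userspace
 * passes a nested CTA_NAT attribute such as
 *
 *	CTA_NAT_V4_MINIP = 203.0.113.1
 *	CTA_NAT_V4_MAXIP = 203.0.113.4
 *	CTA_NAT_PROTO
 *		CTA_PROTONAT_PORT_MIN = 1024
 *		CTA_PROTONAT_PORT_MAX = 65535
 *
 * which nfnetlink_parse_nat() translates into an nf_nat_range for
 * nf_nat_setup_info().
 */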
static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
};

static int
nfnetlink_parse_nat(const struct nlattr *nat,
		    const struct nf_conn *ct, struct nf_nat_range *range,
		    const struct nf_nat_l3proto *l3proto)
{
	struct nlattr *tb[CTA_NAT_MAX+1];
	int err;

	memset(range, 0, sizeof(*range));

	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy);
	if (err < 0)
		return err;

	err = l3proto->nlattr_to_range(tb, range);
	if (err < 0)
		return err;

	if (!tb[CTA_NAT_PROTO])
		return 0;

	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
}

/* This function is called under rcu_read_lock() */
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	struct nf_nat_range range;
	const struct nf_nat_l3proto *l3proto;
	int err;

	/* Should not happen, restricted to creating new conntracks
	 * via ctnetlink.
	 */
	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
		return -EEXIST;

	/* Make sure the L3 protocol is there by the time we call
	 * nf_nat_setup_info to attach the null binding, otherwise
	 * this may oops.
	 */
	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
	if (l3proto == NULL)
		return -EAGAIN;

	/* No NAT information has been passed, allocate the null-binding */
	if (attr == NULL)
		return __nf_nat_alloc_null_binding(ct, manip);

	err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
	if (err < 0)
		return err;

	return nf_nat_setup_info(ct, &range, manip);
}
#else
static int
nfnetlink_parse_nat_setup(struct nf_conn *ct,
			  enum nf_nat_manip_type manip,
			  const struct nlattr *attr)
{
	return -EOPNOTSUPP;
}
#endif
static int __net_init nf_nat_net_init(struct net *net)
{
	/* Leave them the same for the moment. */
	net->ct.nat_htable_size = net->ct.htable_size;
	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
	if (!net->ct.nat_bysource)
		return -ENOMEM;
	return 0;
}

static void __net_exit nf_nat_net_exit(struct net *net)
{
	struct nf_nat_proto_clean clean = {};

	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
	synchronize_rcu();
	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}

static struct pernet_operations nf_nat_net_ops = {
	.init = nf_nat_net_init,
	.exit = nf_nat_net_exit,
};

static struct nf_ct_helper_expectfn follow_master_nat = {
	.name		= "nat-follow-master",
	.expectfn	= nf_nat_follow_master,
};

static int __init nf_nat_init(void)
{
	int ret;

	ret = nf_ct_extend_register(&nat_extend);
	if (ret < 0) {
		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
		return ret;
	}

	ret = register_pernet_subsys(&nf_nat_net_ops);
	if (ret < 0)
		goto cleanup_extend;

	nf_ct_helper_expectfn_register(&follow_master_nat);

	/* Initialize fake conntrack so that NAT will skip it */
	nf_ct_untracked_status_or(IPS_NAT_DONE_MASK);

	BUG_ON(nfnetlink_parse_nat_setup_hook != NULL);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook,
			 nfnetlink_parse_nat_setup);
#ifdef CONFIG_XFRM
	BUG_ON(nf_nat_decode_session_hook != NULL);
	RCU_INIT_POINTER(nf_nat_decode_session_hook, __nf_nat_decode_session);
#endif
	return 0;

cleanup_extend:
	nf_ct_extend_unregister(&nat_extend);
	return ret;
}

static void __exit nf_nat_cleanup(void)
{
	unsigned int i;

	unregister_pernet_subsys(&nf_nat_net_ops);
	nf_ct_extend_unregister(&nat_extend);
	nf_ct_helper_expectfn_unregister(&follow_master_nat);
	RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, NULL);
#ifdef CONFIG_XFRM
	RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL);
#endif
	for (i = 0; i < NFPROTO_NUMPROTO; i++)
		kfree(nf_nat_l4protos[i]);
	synchronize_net();
}

MODULE_LICENSE("GPL");

module_init(nf_nat_init);
module_exit(nf_nat_cleanup);