1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Connection state tracking for netfilter. This is separated from, 3 but required by, the NAT layer; it can also be used by an iptables 4 extension. */ 5 6 /* (C) 1999-2001 Paul `Rusty' Russell 7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net> 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <linux/types.h> 15 #include <linux/netfilter.h> 16 #include <linux/module.h> 17 #include <linux/sched.h> 18 #include <linux/skbuff.h> 19 #include <linux/proc_fs.h> 20 #include <linux/vmalloc.h> 21 #include <linux/stddef.h> 22 #include <linux/slab.h> 23 #include <linux/random.h> 24 #include <linux/siphash.h> 25 #include <linux/err.h> 26 #include <linux/percpu.h> 27 #include <linux/moduleparam.h> 28 #include <linux/notifier.h> 29 #include <linux/kernel.h> 30 #include <linux/netdevice.h> 31 #include <linux/socket.h> 32 #include <linux/mm.h> 33 #include <linux/nsproxy.h> 34 #include <linux/rculist_nulls.h> 35 36 #include <net/netfilter/nf_conntrack.h> 37 #include <net/netfilter/nf_conntrack_bpf.h> 38 #include <net/netfilter/nf_conntrack_l4proto.h> 39 #include <net/netfilter/nf_conntrack_expect.h> 40 #include <net/netfilter/nf_conntrack_helper.h> 41 #include <net/netfilter/nf_conntrack_core.h> 42 #include <net/netfilter/nf_conntrack_extend.h> 43 #include <net/netfilter/nf_conntrack_acct.h> 44 #include <net/netfilter/nf_conntrack_ecache.h> 45 #include <net/netfilter/nf_conntrack_zones.h> 46 #include <net/netfilter/nf_conntrack_timestamp.h> 47 #include <net/netfilter/nf_conntrack_timeout.h> 48 #include <net/netfilter/nf_conntrack_labels.h> 49 #include <net/netfilter/nf_conntrack_synproxy.h> 50 #include <net/netfilter/nf_nat.h> 51 #include <net/netfilter/nf_nat_helper.h> 52 #include <net/netns/hash.h> 53 #include <net/ip.h> 54 55 #include "nf_internals.h" 56 57 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 58 EXPORT_SYMBOL_GPL(nf_conntrack_locks); 59 60 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 61 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 62 63 struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 64 EXPORT_SYMBOL_GPL(nf_conntrack_hash); 65 66 struct conntrack_gc_work { 67 struct delayed_work dwork; 68 u32 next_bucket; 69 u32 avg_timeout; 70 u32 count; 71 u32 start_time; 72 bool exiting; 73 bool early_drop; 74 }; 75 76 static __read_mostly struct kmem_cache *nf_conntrack_cachep; 77 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 78 static __read_mostly bool nf_conntrack_locks_all; 79 80 /* serialize hash resizes and nf_ct_iterate_cleanup */ 81 static DEFINE_MUTEX(nf_conntrack_mutex); 82 83 #define GC_SCAN_INTERVAL_MAX (60ul * HZ) 84 #define GC_SCAN_INTERVAL_MIN (1ul * HZ) 85 86 /* clamp timeouts to this value (TCP unacked) */ 87 #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) 88 89 /* Initial bias pretending we have 100 entries at the upper bound so we don't 90 * wakeup often just because we have three entries with a 1s timeout while still 91 * allowing non-idle machines to wakeup more often when needed. 
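 *
 * For example: with only three 1s entries in the table, the averaging in
 * gc_worker() starts from 100 pseudo-entries at GC_SCAN_INTERVAL_MAX, so
 * those samples barely move the result and the worker keeps sleeping
 * close to the 60 second maximum instead of firing every second.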
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	8u
#define MAX_CHAINLEN	(32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
196 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 197 */ 198 smp_store_release(&nf_conntrack_locks_all, false); 199 spin_unlock(&nf_conntrack_locks_all_lock); 200 } 201 202 unsigned int nf_conntrack_htable_size __read_mostly; 203 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 204 205 unsigned int nf_conntrack_max __read_mostly; 206 EXPORT_SYMBOL_GPL(nf_conntrack_max); 207 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 208 static siphash_aligned_key_t nf_conntrack_hash_rnd; 209 210 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 211 unsigned int zoneid, 212 const struct net *net) 213 { 214 struct { 215 struct nf_conntrack_man src; 216 union nf_inet_addr dst_addr; 217 unsigned int zone; 218 u32 net_mix; 219 u16 dport; 220 u16 proto; 221 } __aligned(SIPHASH_ALIGNMENT) combined; 222 223 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 224 225 memset(&combined, 0, sizeof(combined)); 226 227 /* The direction must be ignored, so handle usable members manually. */ 228 combined.src = tuple->src; 229 combined.dst_addr = tuple->dst.u3; 230 combined.zone = zoneid; 231 combined.net_mix = net_hash_mix(net); 232 combined.dport = (__force __u16)tuple->dst.u.all; 233 combined.proto = tuple->dst.protonum; 234 235 return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); 236 } 237 238 static u32 scale_hash(u32 hash) 239 { 240 return reciprocal_scale(hash, nf_conntrack_htable_size); 241 } 242 243 static u32 __hash_conntrack(const struct net *net, 244 const struct nf_conntrack_tuple *tuple, 245 unsigned int zoneid, 246 unsigned int size) 247 { 248 return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); 249 } 250 251 static u32 hash_conntrack(const struct net *net, 252 const struct nf_conntrack_tuple *tuple, 253 unsigned int zoneid) 254 { 255 return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); 256 } 257 258 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 259 unsigned int dataoff, 260 struct nf_conntrack_tuple *tuple) 261 { struct { 262 __be16 sport; 263 __be16 dport; 264 } _inet_hdr, *inet_hdr; 265 266 /* Actually only need first 4 bytes to get ports. 
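	 *
	 * (TCP, UDP, UDP-Lite, SCTP and DCCP headers all begin with a 16-bit
	 * source port followed by a 16-bit destination port, so a single
	 * 4-byte read covers every port-based protocol dispatched here by
	 * nf_ct_get_tuple() below.)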
*/ 267 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 268 if (!inet_hdr) 269 return false; 270 271 tuple->src.u.udp.port = inet_hdr->sport; 272 tuple->dst.u.udp.port = inet_hdr->dport; 273 return true; 274 } 275 276 static bool 277 nf_ct_get_tuple(const struct sk_buff *skb, 278 unsigned int nhoff, 279 unsigned int dataoff, 280 u_int16_t l3num, 281 u_int8_t protonum, 282 struct net *net, 283 struct nf_conntrack_tuple *tuple) 284 { 285 unsigned int size; 286 const __be32 *ap; 287 __be32 _addrs[8]; 288 289 memset(tuple, 0, sizeof(*tuple)); 290 291 tuple->src.l3num = l3num; 292 switch (l3num) { 293 case NFPROTO_IPV4: 294 nhoff += offsetof(struct iphdr, saddr); 295 size = 2 * sizeof(__be32); 296 break; 297 case NFPROTO_IPV6: 298 nhoff += offsetof(struct ipv6hdr, saddr); 299 size = sizeof(_addrs); 300 break; 301 default: 302 return true; 303 } 304 305 ap = skb_header_pointer(skb, nhoff, size, _addrs); 306 if (!ap) 307 return false; 308 309 switch (l3num) { 310 case NFPROTO_IPV4: 311 tuple->src.u3.ip = ap[0]; 312 tuple->dst.u3.ip = ap[1]; 313 break; 314 case NFPROTO_IPV6: 315 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 316 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 317 break; 318 } 319 320 tuple->dst.protonum = protonum; 321 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 322 323 switch (protonum) { 324 #if IS_ENABLED(CONFIG_IPV6) 325 case IPPROTO_ICMPV6: 326 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); 327 #endif 328 case IPPROTO_ICMP: 329 return icmp_pkt_to_tuple(skb, dataoff, net, tuple); 330 #ifdef CONFIG_NF_CT_PROTO_GRE 331 case IPPROTO_GRE: 332 return gre_pkt_to_tuple(skb, dataoff, net, tuple); 333 #endif 334 case IPPROTO_TCP: 335 case IPPROTO_UDP: 336 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 337 case IPPROTO_UDPLITE: 338 #endif 339 #ifdef CONFIG_NF_CT_PROTO_SCTP 340 case IPPROTO_SCTP: 341 #endif 342 #ifdef CONFIG_NF_CT_PROTO_DCCP 343 case IPPROTO_DCCP: 344 #endif 345 /* fallthrough */ 346 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 347 default: 348 break; 349 } 350 351 return true; 352 } 353 354 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 355 u_int8_t *protonum) 356 { 357 int dataoff = -1; 358 const struct iphdr *iph; 359 struct iphdr _iph; 360 361 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 362 if (!iph) 363 return -1; 364 365 /* Conntrack defragments packets, we might still see fragments 366 * inside ICMP packets though. 
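	 *
	 * (The quoted header inside an ICMP error is not reassembled by
	 * nf_defrag, so it may still carry a non-zero fragment offset even
	 * though the outer packet was defragmented; such embedded fragments
	 * cannot be resolved to a tuple and are rejected here.)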
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4.
the original direction tuple 496 */ 497 u32 nf_ct_get_id(const struct nf_conn *ct) 498 { 499 static siphash_aligned_key_t ct_id_seed; 500 unsigned long a, b, c, d; 501 502 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); 503 504 a = (unsigned long)ct; 505 b = (unsigned long)ct->master; 506 c = (unsigned long)nf_ct_net(ct); 507 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 508 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), 509 &ct_id_seed); 510 #ifdef CONFIG_64BIT 511 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); 512 #else 513 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); 514 #endif 515 } 516 EXPORT_SYMBOL_GPL(nf_ct_get_id); 517 518 static void 519 clean_from_lists(struct nf_conn *ct) 520 { 521 pr_debug("clean_from_lists(%p)\n", ct); 522 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 523 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 524 525 /* Destroy all pending expectations */ 526 nf_ct_remove_expectations(ct); 527 } 528 529 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 530 531 /* Released via nf_ct_destroy() */ 532 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 533 const struct nf_conntrack_zone *zone, 534 gfp_t flags) 535 { 536 struct nf_conn *tmpl, *p; 537 538 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 539 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 540 if (!tmpl) 541 return NULL; 542 543 p = tmpl; 544 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 545 if (tmpl != p) { 546 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 547 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 548 } 549 } else { 550 tmpl = kzalloc(sizeof(*tmpl), flags); 551 if (!tmpl) 552 return NULL; 553 } 554 555 tmpl->status = IPS_TEMPLATE; 556 write_pnet(&tmpl->ct_net, net); 557 nf_ct_zone_add(tmpl, zone); 558 refcount_set(&tmpl->ct_general.use, 1); 559 560 return tmpl; 561 } 562 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 563 564 void nf_ct_tmpl_free(struct nf_conn *tmpl) 565 { 566 kfree(tmpl->ext); 567 568 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 569 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 570 else 571 kfree(tmpl); 572 } 573 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 574 575 static void destroy_gre_conntrack(struct nf_conn *ct) 576 { 577 #ifdef CONFIG_NF_CT_PROTO_GRE 578 struct nf_conn *master = ct->master; 579 580 if (master) 581 nf_ct_gre_keymap_destroy(master); 582 #endif 583 } 584 585 void nf_ct_destroy(struct nf_conntrack *nfct) 586 { 587 struct nf_conn *ct = (struct nf_conn *)nfct; 588 589 pr_debug("%s(%p)\n", __func__, ct); 590 WARN_ON(refcount_read(&nfct->use) != 0); 591 592 if (unlikely(nf_ct_is_template(ct))) { 593 nf_ct_tmpl_free(ct); 594 return; 595 } 596 597 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) 598 destroy_gre_conntrack(ct); 599 600 /* Expectations will have been removed in clean_from_lists, 601 * except TFTP can create an expectation on the first packet, 602 * before connection is in the list, so we need to clean here, 603 * too. 
604 */ 605 nf_ct_remove_expectations(ct); 606 607 if (ct->master) 608 nf_ct_put(ct->master); 609 610 pr_debug("%s: returning ct=%p to slab\n", __func__, ct); 611 nf_conntrack_free(ct); 612 } 613 EXPORT_SYMBOL(nf_ct_destroy); 614 615 static void __nf_ct_delete_from_lists(struct nf_conn *ct) 616 { 617 struct net *net = nf_ct_net(ct); 618 unsigned int hash, reply_hash; 619 unsigned int sequence; 620 621 do { 622 sequence = read_seqcount_begin(&nf_conntrack_generation); 623 hash = hash_conntrack(net, 624 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 625 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 626 reply_hash = hash_conntrack(net, 627 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 628 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 629 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 630 631 clean_from_lists(ct); 632 nf_conntrack_double_unlock(hash, reply_hash); 633 } 634 635 static void nf_ct_delete_from_lists(struct nf_conn *ct) 636 { 637 nf_ct_helper_destroy(ct); 638 local_bh_disable(); 639 640 __nf_ct_delete_from_lists(ct); 641 642 local_bh_enable(); 643 } 644 645 static void nf_ct_add_to_ecache_list(struct nf_conn *ct) 646 { 647 #ifdef CONFIG_NF_CONNTRACK_EVENTS 648 struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); 649 650 spin_lock(&cnet->ecache.dying_lock); 651 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 652 &cnet->ecache.dying_list); 653 spin_unlock(&cnet->ecache.dying_lock); 654 #endif 655 } 656 657 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 658 { 659 struct nf_conn_tstamp *tstamp; 660 struct net *net; 661 662 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 663 return false; 664 665 tstamp = nf_conn_tstamp_find(ct); 666 if (tstamp) { 667 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 668 669 tstamp->stop = ktime_get_real_ns(); 670 if (timeout < 0) 671 tstamp->stop -= jiffies_to_nsecs(-timeout); 672 } 673 674 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 675 portid, report) < 0) { 676 /* destroy event was not delivered. nf_ct_put will 677 * be done by event cache worker on redelivery. 
678 */ 679 nf_ct_helper_destroy(ct); 680 local_bh_disable(); 681 __nf_ct_delete_from_lists(ct); 682 nf_ct_add_to_ecache_list(ct); 683 local_bh_enable(); 684 685 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); 686 return false; 687 } 688 689 net = nf_ct_net(ct); 690 if (nf_conntrack_ecache_dwork_pending(net)) 691 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); 692 nf_ct_delete_from_lists(ct); 693 nf_ct_put(ct); 694 return true; 695 } 696 EXPORT_SYMBOL_GPL(nf_ct_delete); 697 698 static inline bool 699 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 700 const struct nf_conntrack_tuple *tuple, 701 const struct nf_conntrack_zone *zone, 702 const struct net *net) 703 { 704 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 705 706 /* A conntrack can be recreated with the equal tuple, 707 * so we need to check that the conntrack is confirmed 708 */ 709 return nf_ct_tuple_equal(tuple, &h->tuple) && 710 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 711 nf_ct_is_confirmed(ct) && 712 net_eq(net, nf_ct_net(ct)); 713 } 714 715 static inline bool 716 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 717 { 718 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 719 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 720 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 721 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 722 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 723 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 724 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 725 } 726 727 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 728 static void nf_ct_gc_expired(struct nf_conn *ct) 729 { 730 if (!refcount_inc_not_zero(&ct->ct_general.use)) 731 return; 732 733 /* load ->status after refcount increase */ 734 smp_acquire__after_ctrl_dep(); 735 736 if (nf_ct_should_gc(ct)) 737 nf_ct_kill(ct); 738 739 nf_ct_put(ct); 740 } 741 742 /* 743 * Warning : 744 * - Caller must take a reference on returned object 745 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 746 */ 747 static struct nf_conntrack_tuple_hash * 748 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 749 const struct nf_conntrack_tuple *tuple, u32 hash) 750 { 751 struct nf_conntrack_tuple_hash *h; 752 struct hlist_nulls_head *ct_hash; 753 struct hlist_nulls_node *n; 754 unsigned int bucket, hsize; 755 756 begin: 757 nf_conntrack_get_ht(&ct_hash, &hsize); 758 bucket = reciprocal_scale(hash, hsize); 759 760 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 761 struct nf_conn *ct; 762 763 ct = nf_ct_tuplehash_to_ctrack(h); 764 if (nf_ct_is_expired(ct)) { 765 nf_ct_gc_expired(ct); 766 continue; 767 } 768 769 if (nf_ct_key_equal(h, tuple, zone, net)) 770 return h; 771 } 772 /* 773 * if the nulls value we got at the end of this lookup is 774 * not the expected one, we must restart lookup. 775 * We probably met an item that was moved to another chain. 776 */ 777 if (get_nulls_value(n) != bucket) { 778 NF_CT_STAT_INC_ATOMIC(net, search_restart); 779 goto begin; 780 } 781 782 return NULL; 783 } 784 785 /* Find a connection corresponding to a tuple. 
*/ 786 static struct nf_conntrack_tuple_hash * 787 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 788 const struct nf_conntrack_tuple *tuple, u32 hash) 789 { 790 struct nf_conntrack_tuple_hash *h; 791 struct nf_conn *ct; 792 793 rcu_read_lock(); 794 795 h = ____nf_conntrack_find(net, zone, tuple, hash); 796 if (h) { 797 /* We have a candidate that matches the tuple we're interested 798 * in, try to obtain a reference and re-check tuple 799 */ 800 ct = nf_ct_tuplehash_to_ctrack(h); 801 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { 802 /* re-check key after refcount */ 803 smp_acquire__after_ctrl_dep(); 804 805 if (likely(nf_ct_key_equal(h, tuple, zone, net))) 806 goto found; 807 808 /* TYPESAFE_BY_RCU recycled the candidate */ 809 nf_ct_put(ct); 810 } 811 812 h = NULL; 813 } 814 found: 815 rcu_read_unlock(); 816 817 return h; 818 } 819 820 struct nf_conntrack_tuple_hash * 821 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 822 const struct nf_conntrack_tuple *tuple) 823 { 824 unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 825 struct nf_conntrack_tuple_hash *thash; 826 827 thash = __nf_conntrack_find_get(net, zone, tuple, 828 hash_conntrack_raw(tuple, zone_id, net)); 829 830 if (thash) 831 return thash; 832 833 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 834 if (rid != zone_id) 835 return __nf_conntrack_find_get(net, zone, tuple, 836 hash_conntrack_raw(tuple, rid, net)); 837 return thash; 838 } 839 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 840 841 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 842 unsigned int hash, 843 unsigned int reply_hash) 844 { 845 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 846 &nf_conntrack_hash[hash]); 847 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 848 &nf_conntrack_hash[reply_hash]); 849 } 850 851 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) 852 { 853 /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions 854 * may contain stale pointers to e.g. helper that has been removed. 855 * 856 * The helper can't clear this because the nf_conn object isn't in 857 * any hash and synchronize_rcu() isn't enough because associated skb 858 * might sit in a queue. 859 */ 860 return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); 861 } 862 863 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) 864 { 865 if (!ext) 866 return true; 867 868 if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) 869 return false; 870 871 /* inserted into conntrack table, nf_ct_iterate_cleanup() 872 * will find it. Disable nf_ct_ext_find() id check. 
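	 *
	 * (If the generation id no longer matches at this point, e.g. because
	 * a helper module went away after this entry was allocated, the
	 * caller kills the freshly inserted entry instead of leaving stale
	 * extension pointers behind.)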
873 */ 874 WRITE_ONCE(ext->gen_id, 0); 875 return true; 876 } 877 878 int 879 nf_conntrack_hash_check_insert(struct nf_conn *ct) 880 { 881 const struct nf_conntrack_zone *zone; 882 struct net *net = nf_ct_net(ct); 883 unsigned int hash, reply_hash; 884 struct nf_conntrack_tuple_hash *h; 885 struct hlist_nulls_node *n; 886 unsigned int max_chainlen; 887 unsigned int chainlen = 0; 888 unsigned int sequence; 889 int err = -EEXIST; 890 891 zone = nf_ct_zone(ct); 892 893 if (!nf_ct_ext_valid_pre(ct->ext)) { 894 NF_CT_STAT_INC(net, insert_failed); 895 return -ETIMEDOUT; 896 } 897 898 local_bh_disable(); 899 do { 900 sequence = read_seqcount_begin(&nf_conntrack_generation); 901 hash = hash_conntrack(net, 902 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 903 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 904 reply_hash = hash_conntrack(net, 905 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 906 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 907 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 908 909 max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); 910 911 /* See if there's one in the list already, including reverse */ 912 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 913 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 914 zone, net)) 915 goto out; 916 917 if (chainlen++ > max_chainlen) 918 goto chaintoolong; 919 } 920 921 chainlen = 0; 922 923 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 924 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 925 zone, net)) 926 goto out; 927 if (chainlen++ > max_chainlen) 928 goto chaintoolong; 929 } 930 931 smp_wmb(); 932 /* The caller holds a reference to this object */ 933 refcount_set(&ct->ct_general.use, 2); 934 __nf_conntrack_hash_insert(ct, hash, reply_hash); 935 nf_conntrack_double_unlock(hash, reply_hash); 936 NF_CT_STAT_INC(net, insert); 937 local_bh_enable(); 938 939 if (!nf_ct_ext_valid_post(ct->ext)) { 940 nf_ct_kill(ct); 941 NF_CT_STAT_INC(net, drop); 942 return -ETIMEDOUT; 943 } 944 945 return 0; 946 chaintoolong: 947 NF_CT_STAT_INC(net, chaintoolong); 948 err = -ENOSPC; 949 out: 950 nf_conntrack_double_unlock(hash, reply_hash); 951 local_bh_enable(); 952 return err; 953 } 954 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 955 956 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, 957 unsigned int bytes) 958 { 959 struct nf_conn_acct *acct; 960 961 acct = nf_conn_acct_find(ct); 962 if (acct) { 963 struct nf_conn_counter *counter = acct->counter; 964 965 atomic64_add(packets, &counter[dir].packets); 966 atomic64_add(bytes, &counter[dir].bytes); 967 } 968 } 969 EXPORT_SYMBOL_GPL(nf_ct_acct_add); 970 971 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 972 const struct nf_conn *loser_ct) 973 { 974 struct nf_conn_acct *acct; 975 976 acct = nf_conn_acct_find(loser_ct); 977 if (acct) { 978 struct nf_conn_counter *counter = acct->counter; 979 unsigned int bytes; 980 981 /* u32 should be fine since we must have seen one packet. */ 982 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 983 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 984 } 985 } 986 987 static void __nf_conntrack_insert_prepare(struct nf_conn *ct) 988 { 989 struct nf_conn_tstamp *tstamp; 990 991 refcount_inc(&ct->ct_general.use); 992 993 /* set conntrack timestamp, if enabled. 
 */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply. Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table. This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
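	 *
	 * (hlist_nulls_add_fake() marks the ORIGINAL-direction node as hashed
	 * without linking it into any bucket, so the eventual unlink in
	 * clean_from_lists() remains safe while only the REPLY direction of
	 * this entry can be found by lookups.)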
1081 */ 1082 hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 1083 1084 hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 1085 &nf_conntrack_hash[repl_idx]); 1086 1087 NF_CT_STAT_INC(net, clash_resolve); 1088 return NF_ACCEPT; 1089 } 1090 1091 /** 1092 * nf_ct_resolve_clash - attempt to handle clash without packet drop 1093 * 1094 * @skb: skb that causes the clash 1095 * @h: tuplehash of the clashing entry already in table 1096 * @reply_hash: hash slot for reply direction 1097 * 1098 * A conntrack entry can be inserted to the connection tracking table 1099 * if there is no existing entry with an identical tuple. 1100 * 1101 * If there is one, @skb (and the assocated, unconfirmed conntrack) has 1102 * to be dropped. In case @skb is retransmitted, next conntrack lookup 1103 * will find the already-existing entry. 1104 * 1105 * The major problem with such packet drop is the extra delay added by 1106 * the packet loss -- it will take some time for a retransmit to occur 1107 * (or the sender to time out when waiting for a reply). 1108 * 1109 * This function attempts to handle the situation without packet drop. 1110 * 1111 * If @skb has no NAT transformation or if the colliding entries are 1112 * exactly the same, only the to-be-confirmed conntrack entry is discarded 1113 * and @skb is associated with the conntrack entry already in the table. 1114 * 1115 * Failing that, the new, unconfirmed conntrack is still added to the table 1116 * provided that the collision only occurs in the ORIGINAL direction. 1117 * The new entry will be added only in the non-clashing REPLY direction, 1118 * so packets in the ORIGINAL direction will continue to match the existing 1119 * entry. The new entry will also have a fixed timeout so it expires -- 1120 * due to the collision, it will only see reply traffic. 1121 * 1122 * Returns NF_DROP if the clash could not be resolved. 1123 */ 1124 static __cold noinline int 1125 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, 1126 u32 reply_hash) 1127 { 1128 /* This is the conntrack entry already in hashes that won race. */ 1129 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 1130 const struct nf_conntrack_l4proto *l4proto; 1131 enum ip_conntrack_info ctinfo; 1132 struct nf_conn *loser_ct; 1133 struct net *net; 1134 int ret; 1135 1136 loser_ct = nf_ct_get(skb, &ctinfo); 1137 net = nf_ct_net(loser_ct); 1138 1139 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1140 if (!l4proto->allow_clash) 1141 goto drop; 1142 1143 ret = __nf_ct_resolve_clash(skb, h); 1144 if (ret == NF_ACCEPT) 1145 return ret; 1146 1147 ret = nf_ct_resolve_clash_harder(skb, reply_hash); 1148 if (ret == NF_ACCEPT) 1149 return ret; 1150 1151 drop: 1152 NF_CT_STAT_INC(net, drop); 1153 NF_CT_STAT_INC(net, insert_failed); 1154 return NF_DROP; 1155 } 1156 1157 /* Confirm a connection given skb; places it in hash table */ 1158 int 1159 __nf_conntrack_confirm(struct sk_buff *skb) 1160 { 1161 unsigned int chainlen = 0, sequence, max_chainlen; 1162 const struct nf_conntrack_zone *zone; 1163 unsigned int hash, reply_hash; 1164 struct nf_conntrack_tuple_hash *h; 1165 struct nf_conn *ct; 1166 struct nf_conn_help *help; 1167 struct hlist_nulls_node *n; 1168 enum ip_conntrack_info ctinfo; 1169 struct net *net; 1170 int ret = NF_DROP; 1171 1172 ct = nf_ct_get(skb, &ctinfo); 1173 net = nf_ct_net(ct); 1174 1175 /* ipt_REJECT uses nf_conntrack_attach to attach related 1176 ICMP/TCP RST packets in other direction. 
Actual packet 1177 which created connection will be IP_CT_NEW or for an 1178 expected connection, IP_CT_RELATED. */ 1179 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 1180 return NF_ACCEPT; 1181 1182 zone = nf_ct_zone(ct); 1183 local_bh_disable(); 1184 1185 do { 1186 sequence = read_seqcount_begin(&nf_conntrack_generation); 1187 /* reuse the hash saved before */ 1188 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 1189 hash = scale_hash(hash); 1190 reply_hash = hash_conntrack(net, 1191 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1192 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 1193 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 1194 1195 /* We're not in hash table, and we refuse to set up related 1196 * connections for unconfirmed conns. But packet copies and 1197 * REJECT will give spurious warnings here. 1198 */ 1199 1200 /* Another skb with the same unconfirmed conntrack may 1201 * win the race. This may happen for bridge(br_flood) 1202 * or broadcast/multicast packets do skb_clone with 1203 * unconfirmed conntrack. 1204 */ 1205 if (unlikely(nf_ct_is_confirmed(ct))) { 1206 WARN_ON_ONCE(1); 1207 nf_conntrack_double_unlock(hash, reply_hash); 1208 local_bh_enable(); 1209 return NF_DROP; 1210 } 1211 1212 if (!nf_ct_ext_valid_pre(ct->ext)) { 1213 NF_CT_STAT_INC(net, insert_failed); 1214 goto dying; 1215 } 1216 1217 pr_debug("Confirming conntrack %p\n", ct); 1218 /* We have to check the DYING flag after unlink to prevent 1219 * a race against nf_ct_get_next_corpse() possibly called from 1220 * user context, else we insert an already 'dead' hash, blocking 1221 * further use of that particular connection -JM. 1222 */ 1223 ct->status |= IPS_CONFIRMED; 1224 1225 if (unlikely(nf_ct_is_dying(ct))) { 1226 NF_CT_STAT_INC(net, insert_failed); 1227 goto dying; 1228 } 1229 1230 max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); 1231 /* See if there's one in the list already, including reverse: 1232 NAT could have grabbed it without realizing, since we're 1233 not in the hash. If there is, we lost race. */ 1234 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 1235 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1236 zone, net)) 1237 goto out; 1238 if (chainlen++ > max_chainlen) 1239 goto chaintoolong; 1240 } 1241 1242 chainlen = 0; 1243 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 1244 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1245 zone, net)) 1246 goto out; 1247 if (chainlen++ > max_chainlen) { 1248 chaintoolong: 1249 NF_CT_STAT_INC(net, chaintoolong); 1250 NF_CT_STAT_INC(net, insert_failed); 1251 ret = NF_DROP; 1252 goto dying; 1253 } 1254 } 1255 1256 /* Timer relative to confirmation time, not original 1257 setting time, otherwise we'd get timer wrap in 1258 weird delay cases. */ 1259 ct->timeout += nfct_time_stamp; 1260 1261 __nf_conntrack_insert_prepare(ct); 1262 1263 /* Since the lookup is lockless, hash insertion must be done after 1264 * starting the timer and setting the CONFIRMED bit. The RCU barriers 1265 * guarantee that no other CPU can find the conntrack before the above 1266 * stores are visible. 1267 */ 1268 __nf_conntrack_hash_insert(ct, hash, reply_hash); 1269 nf_conntrack_double_unlock(hash, reply_hash); 1270 local_bh_enable(); 1271 1272 /* ext area is still valid (rcu read lock is held, 1273 * but will go out of scope soon, we need to remove 1274 * this conntrack again. 
1275 */ 1276 if (!nf_ct_ext_valid_post(ct->ext)) { 1277 nf_ct_kill(ct); 1278 NF_CT_STAT_INC(net, drop); 1279 return NF_DROP; 1280 } 1281 1282 help = nfct_help(ct); 1283 if (help && help->helper) 1284 nf_conntrack_event_cache(IPCT_HELPER, ct); 1285 1286 nf_conntrack_event_cache(master_ct(ct) ? 1287 IPCT_RELATED : IPCT_NEW, ct); 1288 return NF_ACCEPT; 1289 1290 out: 1291 ret = nf_ct_resolve_clash(skb, h, reply_hash); 1292 dying: 1293 nf_conntrack_double_unlock(hash, reply_hash); 1294 local_bh_enable(); 1295 return ret; 1296 } 1297 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 1298 1299 /* Returns true if a connection correspondings to the tuple (required 1300 for NAT). */ 1301 int 1302 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, 1303 const struct nf_conn *ignored_conntrack) 1304 { 1305 struct net *net = nf_ct_net(ignored_conntrack); 1306 const struct nf_conntrack_zone *zone; 1307 struct nf_conntrack_tuple_hash *h; 1308 struct hlist_nulls_head *ct_hash; 1309 unsigned int hash, hsize; 1310 struct hlist_nulls_node *n; 1311 struct nf_conn *ct; 1312 1313 zone = nf_ct_zone(ignored_conntrack); 1314 1315 rcu_read_lock(); 1316 begin: 1317 nf_conntrack_get_ht(&ct_hash, &hsize); 1318 hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); 1319 1320 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 1321 ct = nf_ct_tuplehash_to_ctrack(h); 1322 1323 if (ct == ignored_conntrack) 1324 continue; 1325 1326 if (nf_ct_is_expired(ct)) { 1327 nf_ct_gc_expired(ct); 1328 continue; 1329 } 1330 1331 if (nf_ct_key_equal(h, tuple, zone, net)) { 1332 /* Tuple is taken already, so caller will need to find 1333 * a new source port to use. 1334 * 1335 * Only exception: 1336 * If the *original tuples* are identical, then both 1337 * conntracks refer to the same flow. 1338 * This is a rare situation, it can occur e.g. when 1339 * more than one UDP packet is sent from same socket 1340 * in different threads. 1341 * 1342 * Let nf_ct_resolve_clash() deal with this later. 1343 */ 1344 if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1345 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 1346 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) 1347 continue; 1348 1349 NF_CT_STAT_INC_ATOMIC(net, found); 1350 rcu_read_unlock(); 1351 return 1; 1352 } 1353 } 1354 1355 if (get_nulls_value(n) != hash) { 1356 NF_CT_STAT_INC_ATOMIC(net, search_restart); 1357 goto begin; 1358 } 1359 1360 rcu_read_unlock(); 1361 1362 return 0; 1363 } 1364 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); 1365 1366 #define NF_CT_EVICTION_RANGE 8 1367 1368 /* There's a small race here where we may free a just-assured 1369 connection. Too bad: we're in trouble anyway. 
*/ 1370 static unsigned int early_drop_list(struct net *net, 1371 struct hlist_nulls_head *head) 1372 { 1373 struct nf_conntrack_tuple_hash *h; 1374 struct hlist_nulls_node *n; 1375 unsigned int drops = 0; 1376 struct nf_conn *tmp; 1377 1378 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1379 tmp = nf_ct_tuplehash_to_ctrack(h); 1380 1381 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 1382 continue; 1383 1384 if (nf_ct_is_expired(tmp)) { 1385 nf_ct_gc_expired(tmp); 1386 continue; 1387 } 1388 1389 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1390 !net_eq(nf_ct_net(tmp), net) || 1391 nf_ct_is_dying(tmp)) 1392 continue; 1393 1394 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1395 continue; 1396 1397 /* load ->ct_net and ->status after refcount increase */ 1398 smp_acquire__after_ctrl_dep(); 1399 1400 /* kill only if still in same netns -- might have moved due to 1401 * SLAB_TYPESAFE_BY_RCU rules. 1402 * 1403 * We steal the timer reference. If that fails timer has 1404 * already fired or someone else deleted it. Just drop ref 1405 * and move to next entry. 1406 */ 1407 if (net_eq(nf_ct_net(tmp), net) && 1408 nf_ct_is_confirmed(tmp) && 1409 nf_ct_delete(tmp, 0, 0)) 1410 drops++; 1411 1412 nf_ct_put(tmp); 1413 } 1414 1415 return drops; 1416 } 1417 1418 static noinline int early_drop(struct net *net, unsigned int hash) 1419 { 1420 unsigned int i, bucket; 1421 1422 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1423 struct hlist_nulls_head *ct_hash; 1424 unsigned int hsize, drops; 1425 1426 rcu_read_lock(); 1427 nf_conntrack_get_ht(&ct_hash, &hsize); 1428 if (!i) 1429 bucket = reciprocal_scale(hash, hsize); 1430 else 1431 bucket = (bucket + 1) % hsize; 1432 1433 drops = early_drop_list(net, &ct_hash[bucket]); 1434 rcu_read_unlock(); 1435 1436 if (drops) { 1437 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1438 return true; 1439 } 1440 } 1441 1442 return false; 1443 } 1444 1445 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1446 { 1447 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1448 } 1449 1450 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1451 { 1452 const struct nf_conntrack_l4proto *l4proto; 1453 1454 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1455 return true; 1456 1457 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1458 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1459 return true; 1460 1461 return false; 1462 } 1463 1464 static void gc_worker(struct work_struct *work) 1465 { 1466 unsigned int i, hashsz, nf_conntrack_max95 = 0; 1467 u32 end_time, start_time = nfct_time_stamp; 1468 struct conntrack_gc_work *gc_work; 1469 unsigned int expired_count = 0; 1470 unsigned long next_run; 1471 s32 delta_time; 1472 long count; 1473 1474 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1475 1476 i = gc_work->next_bucket; 1477 if (gc_work->early_drop) 1478 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1479 1480 if (i == 0) { 1481 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; 1482 gc_work->count = GC_SCAN_INITIAL_COUNT; 1483 gc_work->start_time = start_time; 1484 } 1485 1486 next_run = gc_work->avg_timeout; 1487 count = gc_work->count; 1488 1489 end_time = start_time + GC_SCAN_MAX_DURATION; 1490 1491 do { 1492 struct nf_conntrack_tuple_hash *h; 1493 struct hlist_nulls_head *ct_hash; 1494 struct hlist_nulls_node *n; 1495 struct nf_conn *tmp; 1496 1497 rcu_read_lock(); 1498 1499 nf_conntrack_get_ht(&ct_hash, &hashsz); 1500 if (i >= hashsz) { 1501 rcu_read_unlock(); 1502 break; 1503 } 1504 1505 
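		/* Each entry scanned below feeds its clamped remaining lifetime
		 * into a running average that was seeded above with
		 * GC_SCAN_INITIAL_COUNT pseudo-entries at the maximum interval:
		 *
		 *	next_run += (expires - next_run) / ++count;
		 *
		 * A table of long-lived entries keeps the worker close to
		 * GC_SCAN_INTERVAL_MAX, while many short-lived entries pull
		 * the next wakeup forward.
		 */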
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1506 struct nf_conntrack_net *cnet; 1507 struct net *net; 1508 long expires; 1509 1510 tmp = nf_ct_tuplehash_to_ctrack(h); 1511 1512 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1513 nf_ct_offload_timeout(tmp); 1514 continue; 1515 } 1516 1517 if (expired_count > GC_SCAN_EXPIRED_MAX) { 1518 rcu_read_unlock(); 1519 1520 gc_work->next_bucket = i; 1521 gc_work->avg_timeout = next_run; 1522 gc_work->count = count; 1523 1524 delta_time = nfct_time_stamp - gc_work->start_time; 1525 1526 /* re-sched immediately if total cycle time is exceeded */ 1527 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; 1528 goto early_exit; 1529 } 1530 1531 if (nf_ct_is_expired(tmp)) { 1532 nf_ct_gc_expired(tmp); 1533 expired_count++; 1534 continue; 1535 } 1536 1537 expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); 1538 expires = (expires - (long)next_run) / ++count; 1539 next_run += expires; 1540 1541 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1542 continue; 1543 1544 net = nf_ct_net(tmp); 1545 cnet = nf_ct_pernet(net); 1546 if (atomic_read(&cnet->count) < nf_conntrack_max95) 1547 continue; 1548 1549 /* need to take reference to avoid possible races */ 1550 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1551 continue; 1552 1553 /* load ->status after refcount increase */ 1554 smp_acquire__after_ctrl_dep(); 1555 1556 if (gc_worker_skip_ct(tmp)) { 1557 nf_ct_put(tmp); 1558 continue; 1559 } 1560 1561 if (gc_worker_can_early_drop(tmp)) { 1562 nf_ct_kill(tmp); 1563 expired_count++; 1564 } 1565 1566 nf_ct_put(tmp); 1567 } 1568 1569 /* could check get_nulls_value() here and restart if ct 1570 * was moved to another chain. But given gc is best-effort 1571 * we will just continue with next hash slot. 
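		 *
		 * (____nf_conntrack_find() does restart on a nulls mismatch,
		 * because a missed entry there would look like a non-existent
		 * connection; here a missed entry is simply picked up again on
		 * a later scan.)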
1572 */ 1573 rcu_read_unlock(); 1574 cond_resched(); 1575 i++; 1576 1577 delta_time = nfct_time_stamp - end_time; 1578 if (delta_time > 0 && i < hashsz) { 1579 gc_work->avg_timeout = next_run; 1580 gc_work->count = count; 1581 gc_work->next_bucket = i; 1582 next_run = 0; 1583 goto early_exit; 1584 } 1585 } while (i < hashsz); 1586 1587 gc_work->next_bucket = 0; 1588 1589 next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); 1590 1591 delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); 1592 if (next_run > (unsigned long)delta_time) 1593 next_run -= delta_time; 1594 else 1595 next_run = 1; 1596 1597 early_exit: 1598 if (gc_work->exiting) 1599 return; 1600 1601 if (next_run) 1602 gc_work->early_drop = false; 1603 1604 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1605 } 1606 1607 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1608 { 1609 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); 1610 gc_work->exiting = false; 1611 } 1612 1613 static struct nf_conn * 1614 __nf_conntrack_alloc(struct net *net, 1615 const struct nf_conntrack_zone *zone, 1616 const struct nf_conntrack_tuple *orig, 1617 const struct nf_conntrack_tuple *repl, 1618 gfp_t gfp, u32 hash) 1619 { 1620 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 1621 unsigned int ct_count; 1622 struct nf_conn *ct; 1623 1624 /* We don't want any race condition at early drop stage */ 1625 ct_count = atomic_inc_return(&cnet->count); 1626 1627 if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { 1628 if (!early_drop(net, hash)) { 1629 if (!conntrack_gc_work.early_drop) 1630 conntrack_gc_work.early_drop = true; 1631 atomic_dec(&cnet->count); 1632 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1633 return ERR_PTR(-ENOMEM); 1634 } 1635 } 1636 1637 /* 1638 * Do not use kmem_cache_zalloc(), as this cache uses 1639 * SLAB_TYPESAFE_BY_RCU. 1640 */ 1641 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1642 if (ct == NULL) 1643 goto out; 1644 1645 spin_lock_init(&ct->lock); 1646 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1647 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1648 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1649 /* save hash for reusing when confirming */ 1650 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1651 ct->status = 0; 1652 WRITE_ONCE(ct->timeout, 0); 1653 write_pnet(&ct->ct_net, net); 1654 memset_after(ct, 0, __nfct_init_offset); 1655 1656 nf_ct_zone_add(ct, zone); 1657 1658 /* Because we use RCU lookups, we set ct_general.use to zero before 1659 * this is inserted in any list. 
1660 */ 1661 refcount_set(&ct->ct_general.use, 0); 1662 return ct; 1663 out: 1664 atomic_dec(&cnet->count); 1665 return ERR_PTR(-ENOMEM); 1666 } 1667 1668 struct nf_conn *nf_conntrack_alloc(struct net *net, 1669 const struct nf_conntrack_zone *zone, 1670 const struct nf_conntrack_tuple *orig, 1671 const struct nf_conntrack_tuple *repl, 1672 gfp_t gfp) 1673 { 1674 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1675 } 1676 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1677 1678 void nf_conntrack_free(struct nf_conn *ct) 1679 { 1680 struct net *net = nf_ct_net(ct); 1681 struct nf_conntrack_net *cnet; 1682 1683 /* A freed object has refcnt == 0, that's 1684 * the golden rule for SLAB_TYPESAFE_BY_RCU 1685 */ 1686 WARN_ON(refcount_read(&ct->ct_general.use) != 0); 1687 1688 if (ct->status & IPS_SRC_NAT_DONE) { 1689 const struct nf_nat_hook *nat_hook; 1690 1691 rcu_read_lock(); 1692 nat_hook = rcu_dereference(nf_nat_hook); 1693 if (nat_hook) 1694 nat_hook->remove_nat_bysrc(ct); 1695 rcu_read_unlock(); 1696 } 1697 1698 kfree(ct->ext); 1699 kmem_cache_free(nf_conntrack_cachep, ct); 1700 cnet = nf_ct_pernet(net); 1701 1702 smp_mb__before_atomic(); 1703 atomic_dec(&cnet->count); 1704 } 1705 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1706 1707 1708 /* Allocate a new conntrack: we return -ENOMEM if classification 1709 failed due to stress. Otherwise it really is unclassifiable. */ 1710 static noinline struct nf_conntrack_tuple_hash * 1711 init_conntrack(struct net *net, struct nf_conn *tmpl, 1712 const struct nf_conntrack_tuple *tuple, 1713 struct sk_buff *skb, 1714 unsigned int dataoff, u32 hash) 1715 { 1716 struct nf_conn *ct; 1717 struct nf_conn_help *help; 1718 struct nf_conntrack_tuple repl_tuple; 1719 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1720 struct nf_conntrack_ecache *ecache; 1721 #endif 1722 struct nf_conntrack_expect *exp = NULL; 1723 const struct nf_conntrack_zone *zone; 1724 struct nf_conn_timeout *timeout_ext; 1725 struct nf_conntrack_zone tmp; 1726 struct nf_conntrack_net *cnet; 1727 1728 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { 1729 pr_debug("Can't invert tuple.\n"); 1730 return NULL; 1731 } 1732 1733 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1734 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1735 hash); 1736 if (IS_ERR(ct)) 1737 return (struct nf_conntrack_tuple_hash *)ct; 1738 1739 if (!nf_ct_add_synproxy(ct, tmpl)) { 1740 nf_conntrack_free(ct); 1741 return ERR_PTR(-ENOMEM); 1742 } 1743 1744 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1745 1746 if (timeout_ext) 1747 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1748 GFP_ATOMIC); 1749 1750 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1751 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1752 nf_ct_labels_ext_add(ct); 1753 1754 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1755 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1756 1757 if ((ecache || net->ct.sysctl_events) && 1758 !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1759 ecache ? ecache->expmask : 0, 1760 GFP_ATOMIC)) { 1761 nf_conntrack_free(ct); 1762 return ERR_PTR(-ENOMEM); 1763 } 1764 #endif 1765 1766 cnet = nf_ct_pernet(net); 1767 if (cnet->expect_count) { 1768 spin_lock_bh(&nf_conntrack_expect_lock); 1769 exp = nf_ct_find_expectation(net, zone, tuple); 1770 if (exp) { 1771 pr_debug("expectation arrives ct=%p exp=%p\n", 1772 ct, exp); 1773 /* Welcome, Mr. Bond. We've been expecting you... 
*/ 1774 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1775 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1776 ct->master = exp->master; 1777 if (exp->helper) { 1778 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1779 if (help) 1780 rcu_assign_pointer(help->helper, exp->helper); 1781 } 1782 1783 #ifdef CONFIG_NF_CONNTRACK_MARK 1784 ct->mark = exp->master->mark; 1785 #endif 1786 #ifdef CONFIG_NF_CONNTRACK_SECMARK 1787 ct->secmark = exp->master->secmark; 1788 #endif 1789 NF_CT_STAT_INC(net, expect_new); 1790 } 1791 spin_unlock_bh(&nf_conntrack_expect_lock); 1792 } 1793 if (!exp && tmpl) 1794 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1795 1796 /* Other CPU might have obtained a pointer to this object before it was 1797 * released. Because refcount is 0, refcount_inc_not_zero() will fail. 1798 * 1799 * After refcount_set(1) it will succeed; ensure that zeroing of 1800 * ct->status and the correct ct->net pointer are visible; else other 1801 * core might observe CONFIRMED bit which means the entry is valid and 1802 * in the hash table, but its not (anymore). 1803 */ 1804 smp_wmb(); 1805 1806 /* Now it is going to be associated with an sk_buff, set refcount to 1. */ 1807 refcount_set(&ct->ct_general.use, 1); 1808 1809 if (exp) { 1810 if (exp->expectfn) 1811 exp->expectfn(ct, exp); 1812 nf_ct_expect_put(exp); 1813 } 1814 1815 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1816 } 1817 1818 /* On success, returns 0, sets skb->_nfct | ctinfo */ 1819 static int 1820 resolve_normal_ct(struct nf_conn *tmpl, 1821 struct sk_buff *skb, 1822 unsigned int dataoff, 1823 u_int8_t protonum, 1824 const struct nf_hook_state *state) 1825 { 1826 const struct nf_conntrack_zone *zone; 1827 struct nf_conntrack_tuple tuple; 1828 struct nf_conntrack_tuple_hash *h; 1829 enum ip_conntrack_info ctinfo; 1830 struct nf_conntrack_zone tmp; 1831 u32 hash, zone_id, rid; 1832 struct nf_conn *ct; 1833 1834 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1835 dataoff, state->pf, protonum, state->net, 1836 &tuple)) { 1837 pr_debug("Can't get tuple\n"); 1838 return 0; 1839 } 1840 1841 /* look for tuple match */ 1842 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1843 1844 zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 1845 hash = hash_conntrack_raw(&tuple, zone_id, state->net); 1846 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1847 1848 if (!h) { 1849 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 1850 if (zone_id != rid) { 1851 u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); 1852 1853 h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); 1854 } 1855 } 1856 1857 if (!h) { 1858 h = init_conntrack(state->net, tmpl, &tuple, 1859 skb, dataoff, hash); 1860 if (!h) 1861 return 0; 1862 if (IS_ERR(h)) 1863 return PTR_ERR(h); 1864 } 1865 ct = nf_ct_tuplehash_to_ctrack(h); 1866 1867 /* It exists; we have (non-exclusive) reference. */ 1868 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1869 ctinfo = IP_CT_ESTABLISHED_REPLY; 1870 } else { 1871 /* Once we've had two way comms, always ESTABLISHED. 
*/ 1872 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1873 pr_debug("normal packet for %p\n", ct); 1874 ctinfo = IP_CT_ESTABLISHED; 1875 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { 1876 pr_debug("related packet for %p\n", ct); 1877 ctinfo = IP_CT_RELATED; 1878 } else { 1879 pr_debug("new packet for %p\n", ct); 1880 ctinfo = IP_CT_NEW; 1881 } 1882 } 1883 nf_ct_set(skb, ct, ctinfo); 1884 return 0; 1885 } 1886 1887 /* 1888 * icmp packets need special treatment to handle error messages that are 1889 * related to a connection. 1890 * 1891 * Callers need to check if skb has a conntrack assigned when this 1892 * helper returns; in such case skb belongs to an already known connection. 1893 */ 1894 static unsigned int __cold 1895 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1896 struct sk_buff *skb, 1897 unsigned int dataoff, 1898 u8 protonum, 1899 const struct nf_hook_state *state) 1900 { 1901 int ret; 1902 1903 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1904 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1905 #if IS_ENABLED(CONFIG_IPV6) 1906 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1907 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1908 #endif 1909 else 1910 return NF_ACCEPT; 1911 1912 if (ret <= 0) 1913 NF_CT_STAT_INC_ATOMIC(state->net, error); 1914 1915 return ret; 1916 } 1917 1918 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1919 enum ip_conntrack_info ctinfo) 1920 { 1921 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1922 1923 if (!timeout) 1924 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1925 1926 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1927 return NF_ACCEPT; 1928 } 1929 1930 /* Returns verdict for packet, or -1 for invalid. */ 1931 static int nf_conntrack_handle_packet(struct nf_conn *ct, 1932 struct sk_buff *skb, 1933 unsigned int dataoff, 1934 enum ip_conntrack_info ctinfo, 1935 const struct nf_hook_state *state) 1936 { 1937 switch (nf_ct_protonum(ct)) { 1938 case IPPROTO_TCP: 1939 return nf_conntrack_tcp_packet(ct, skb, dataoff, 1940 ctinfo, state); 1941 case IPPROTO_UDP: 1942 return nf_conntrack_udp_packet(ct, skb, dataoff, 1943 ctinfo, state); 1944 case IPPROTO_ICMP: 1945 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); 1946 #if IS_ENABLED(CONFIG_IPV6) 1947 case IPPROTO_ICMPV6: 1948 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); 1949 #endif 1950 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 1951 case IPPROTO_UDPLITE: 1952 return nf_conntrack_udplite_packet(ct, skb, dataoff, 1953 ctinfo, state); 1954 #endif 1955 #ifdef CONFIG_NF_CT_PROTO_SCTP 1956 case IPPROTO_SCTP: 1957 return nf_conntrack_sctp_packet(ct, skb, dataoff, 1958 ctinfo, state); 1959 #endif 1960 #ifdef CONFIG_NF_CT_PROTO_DCCP 1961 case IPPROTO_DCCP: 1962 return nf_conntrack_dccp_packet(ct, skb, dataoff, 1963 ctinfo, state); 1964 #endif 1965 #ifdef CONFIG_NF_CT_PROTO_GRE 1966 case IPPROTO_GRE: 1967 return nf_conntrack_gre_packet(ct, skb, dataoff, 1968 ctinfo, state); 1969 #endif 1970 } 1971 1972 return generic_packet(ct, skb, ctinfo); 1973 } 1974 1975 unsigned int 1976 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 1977 { 1978 enum ip_conntrack_info ctinfo; 1979 struct nf_conn *ct, *tmpl; 1980 u_int8_t protonum; 1981 int dataoff, ret; 1982 1983 tmpl = nf_ct_get(skb, &ctinfo); 1984 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1985 /* Previously seen (loopback or untracked)? Ignore. 
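		 *
		 * (A real, non-template conntrack or the untracked marker means
		 * the packet was already handled, e.g. on the loopback path or
		 * via an explicit notrack rule; a template is only a hint and
		 * is replaced by a real entry below.)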
*/ 1986 if ((tmpl && !nf_ct_is_template(tmpl)) || 1987 ctinfo == IP_CT_UNTRACKED) 1988 return NF_ACCEPT; 1989 skb->_nfct = 0; 1990 } 1991 1992 /* rcu_read_lock()ed by nf_hook_thresh */ 1993 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1994 if (dataoff <= 0) { 1995 pr_debug("not prepared to track yet or error occurred\n"); 1996 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1997 ret = NF_ACCEPT; 1998 goto out; 1999 } 2000 2001 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 2002 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 2003 protonum, state); 2004 if (ret <= 0) { 2005 ret = -ret; 2006 goto out; 2007 } 2008 /* ICMP[v6] protocol trackers may assign one conntrack. */ 2009 if (skb->_nfct) 2010 goto out; 2011 } 2012 repeat: 2013 ret = resolve_normal_ct(tmpl, skb, dataoff, 2014 protonum, state); 2015 if (ret < 0) { 2016 /* Too stressed to deal. */ 2017 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2018 ret = NF_DROP; 2019 goto out; 2020 } 2021 2022 ct = nf_ct_get(skb, &ctinfo); 2023 if (!ct) { 2024 /* Not valid part of a connection */ 2025 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2026 ret = NF_ACCEPT; 2027 goto out; 2028 } 2029 2030 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2031 if (ret <= 0) { 2032 /* Invalid: inverse of the return code tells 2033 * the netfilter core what to do */ 2034 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 2035 nf_ct_put(ct); 2036 skb->_nfct = 0; 2037 /* Special case: TCP tracker reports an attempt to reopen a 2038 * closed/aborted connection. We have to go back and create a 2039 * fresh conntrack. 2040 */ 2041 if (ret == -NF_REPEAT) 2042 goto repeat; 2043 2044 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2045 if (ret == -NF_DROP) 2046 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2047 2048 ret = -ret; 2049 goto out; 2050 } 2051 2052 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2053 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2054 nf_conntrack_event_cache(IPCT_REPLY, ct); 2055 out: 2056 if (tmpl) 2057 nf_ct_put(tmpl); 2058 2059 return ret; 2060 } 2061 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2062 2063 /* Alter reply tuple (maybe alter helper). 
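Called by the NAT core on an unconfirmed entry, after the translated reply tuple has been selected.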
This is for NAT, and is 2064 implicitly racy: see __nf_conntrack_confirm */ 2065 void nf_conntrack_alter_reply(struct nf_conn *ct, 2066 const struct nf_conntrack_tuple *newreply) 2067 { 2068 struct nf_conn_help *help = nfct_help(ct); 2069 2070 /* Should be unconfirmed, so not in hash table yet */ 2071 WARN_ON(nf_ct_is_confirmed(ct)); 2072 2073 pr_debug("Altering reply tuple of %p to ", ct); 2074 nf_ct_dump_tuple(newreply); 2075 2076 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 2077 if (ct->master || (help && !hlist_empty(&help->expectations))) 2078 return; 2079 } 2080 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 2081 2082 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2083 void __nf_ct_refresh_acct(struct nf_conn *ct, 2084 enum ip_conntrack_info ctinfo, 2085 const struct sk_buff *skb, 2086 u32 extra_jiffies, 2087 bool do_acct) 2088 { 2089 /* Only update if this is not a fixed timeout */ 2090 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2091 goto acct; 2092 2093 /* If not in hash table, timer will not be active yet */ 2094 if (nf_ct_is_confirmed(ct)) 2095 extra_jiffies += nfct_time_stamp; 2096 2097 if (READ_ONCE(ct->timeout) != extra_jiffies) 2098 WRITE_ONCE(ct->timeout, extra_jiffies); 2099 acct: 2100 if (do_acct) 2101 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2102 } 2103 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2104 2105 bool nf_ct_kill_acct(struct nf_conn *ct, 2106 enum ip_conntrack_info ctinfo, 2107 const struct sk_buff *skb) 2108 { 2109 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2110 2111 return nf_ct_delete(ct, 0, 0); 2112 } 2113 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2114 2115 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2116 2117 #include <linux/netfilter/nfnetlink.h> 2118 #include <linux/netfilter/nfnetlink_conntrack.h> 2119 #include <linux/mutex.h> 2120 2121 /* Generic function for tcp/udp/sctp/dccp and alike. */ 2122 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2123 const struct nf_conntrack_tuple *tuple) 2124 { 2125 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2126 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2127 goto nla_put_failure; 2128 return 0; 2129 2130 nla_put_failure: 2131 return -1; 2132 } 2133 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2134 2135 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2136 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2137 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2138 }; 2139 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2140 2141 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2142 struct nf_conntrack_tuple *t, 2143 u_int32_t flags) 2144 { 2145 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2146 if (!tb[CTA_PROTO_SRC_PORT]) 2147 return -EINVAL; 2148 2149 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2150 } 2151 2152 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2153 if (!tb[CTA_PROTO_DST_PORT]) 2154 return -EINVAL; 2155 2156 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2157 } 2158 2159 return 0; 2160 } 2161 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2162 2163 unsigned int nf_ct_port_nlattr_tuple_size(void) 2164 { 2165 static unsigned int size __read_mostly; 2166 2167 if (!size) 2168 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2169 2170 return size; 2171 } 2172 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2173 #endif 2174 2175 /* Used by ipt_REJECT and ip6t_REJECT. 
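 * The REJECT targets build a new skb (TCP reset or ICMP error) flowing in
 * the opposite direction of the packet that triggered it;
 * nf_conntrack_attach() copies that packet's conntrack reference onto the
 * new skb with the matching RELATED ctinfo, so the reply is not tracked
 * as a separate connection.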
*/ 2176 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2177 { 2178 struct nf_conn *ct; 2179 enum ip_conntrack_info ctinfo; 2180 2181 /* This ICMP is in reverse direction to the packet which caused it */ 2182 ct = nf_ct_get(skb, &ctinfo); 2183 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2184 ctinfo = IP_CT_RELATED_REPLY; 2185 else 2186 ctinfo = IP_CT_RELATED; 2187 2188 /* Attach to new skbuff, and increment count */ 2189 nf_ct_set(nskb, ct, ctinfo); 2190 nf_conntrack_get(skb_nfct(nskb)); 2191 } 2192 2193 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2194 struct nf_conn *ct, 2195 enum ip_conntrack_info ctinfo) 2196 { 2197 const struct nf_nat_hook *nat_hook; 2198 struct nf_conntrack_tuple_hash *h; 2199 struct nf_conntrack_tuple tuple; 2200 unsigned int status; 2201 int dataoff; 2202 u16 l3num; 2203 u8 l4num; 2204 2205 l3num = nf_ct_l3num(ct); 2206 2207 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2208 if (dataoff <= 0) 2209 return -1; 2210 2211 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2212 l4num, net, &tuple)) 2213 return -1; 2214 2215 if (ct->status & IPS_SRC_NAT) { 2216 memcpy(tuple.src.u3.all, 2217 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2218 sizeof(tuple.src.u3.all)); 2219 tuple.src.u.all = 2220 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2221 } 2222 2223 if (ct->status & IPS_DST_NAT) { 2224 memcpy(tuple.dst.u3.all, 2225 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2226 sizeof(tuple.dst.u3.all)); 2227 tuple.dst.u.all = 2228 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2229 } 2230 2231 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2232 if (!h) 2233 return 0; 2234 2235 /* Store status bits of the conntrack that is clashing to re-do NAT 2236 * mangling according to what it has been done already to this packet. 2237 */ 2238 status = ct->status; 2239 2240 nf_ct_put(ct); 2241 ct = nf_ct_tuplehash_to_ctrack(h); 2242 nf_ct_set(skb, ct, ctinfo); 2243 2244 nat_hook = rcu_dereference(nf_nat_hook); 2245 if (!nat_hook) 2246 return 0; 2247 2248 if (status & IPS_SRC_NAT && 2249 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2250 IP_CT_DIR_ORIGINAL) == NF_DROP) 2251 return -1; 2252 2253 if (status & IPS_DST_NAT && 2254 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2255 IP_CT_DIR_ORIGINAL) == NF_DROP) 2256 return -1; 2257 2258 return 0; 2259 } 2260 2261 /* This packet is coming from userspace via nf_queue, complete the packet 2262 * processing after the helper invocation in nf_confirm(). 
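 * nf_confirm_cthelper() below only acts if a userspace helper is attached:
 * it applies any pending TCP sequence adjustments and then confirms the
 * conntrack, i.e. the steps nf_confirm() performs for in-kernel helpers.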
2263 */ 2264 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2265 enum ip_conntrack_info ctinfo) 2266 { 2267 const struct nf_conntrack_helper *helper; 2268 const struct nf_conn_help *help; 2269 int protoff; 2270 2271 help = nfct_help(ct); 2272 if (!help) 2273 return 0; 2274 2275 helper = rcu_dereference(help->helper); 2276 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2277 return 0; 2278 2279 switch (nf_ct_l3num(ct)) { 2280 case NFPROTO_IPV4: 2281 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2282 break; 2283 #if IS_ENABLED(CONFIG_IPV6) 2284 case NFPROTO_IPV6: { 2285 __be16 frag_off; 2286 u8 pnum; 2287 2288 pnum = ipv6_hdr(skb)->nexthdr; 2289 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2290 &frag_off); 2291 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2292 return 0; 2293 break; 2294 } 2295 #endif 2296 default: 2297 return 0; 2298 } 2299 2300 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2301 !nf_is_loopback_packet(skb)) { 2302 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2303 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2304 return -1; 2305 } 2306 } 2307 2308 /* We've seen it coming out the other side: confirm it */ 2309 return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; 2310 } 2311 2312 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2313 { 2314 enum ip_conntrack_info ctinfo; 2315 struct nf_conn *ct; 2316 int err; 2317 2318 ct = nf_ct_get(skb, &ctinfo); 2319 if (!ct) 2320 return 0; 2321 2322 if (!nf_ct_is_confirmed(ct)) { 2323 err = __nf_conntrack_update(net, skb, ct, ctinfo); 2324 if (err < 0) 2325 return err; 2326 2327 ct = nf_ct_get(skb, &ctinfo); 2328 } 2329 2330 return nf_confirm_cthelper(skb, ct, ctinfo); 2331 } 2332 2333 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2334 const struct sk_buff *skb) 2335 { 2336 const struct nf_conntrack_tuple *src_tuple; 2337 const struct nf_conntrack_tuple_hash *hash; 2338 struct nf_conntrack_tuple srctuple; 2339 enum ip_conntrack_info ctinfo; 2340 struct nf_conn *ct; 2341 2342 ct = nf_ct_get(skb, &ctinfo); 2343 if (ct) { 2344 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2345 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2346 return true; 2347 } 2348 2349 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2350 NFPROTO_IPV4, dev_net(skb->dev), 2351 &srctuple)) 2352 return false; 2353 2354 hash = nf_conntrack_find_get(dev_net(skb->dev), 2355 &nf_ct_zone_dflt, 2356 &srctuple); 2357 if (!hash) 2358 return false; 2359 2360 ct = nf_ct_tuplehash_to_ctrack(hash); 2361 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2362 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2363 nf_ct_put(ct); 2364 2365 return true; 2366 } 2367 2368 /* Bring out ya dead! 
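 * get_next_corpse() walks the hash table one bucket at a time under the
 * per-bucket lock and returns the next entry that the iterator asks to
 * kill, with its reference count bumped so the caller can delete it after
 * the lock has been dropped.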
*/ 2369 static struct nf_conn *
2370 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2371 const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
2372 {
2373 struct nf_conntrack_tuple_hash *h;
2374 struct nf_conn *ct;
2375 struct hlist_nulls_node *n;
2376 spinlock_t *lockp;
2377
2378 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2379 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2380
2381 if (hlist_nulls_empty(hslot))
2382 continue;
2383
2384 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2385 local_bh_disable();
2386 nf_conntrack_lock(lockp);
2387 hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2388 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2389 continue;
2390 /* All nf_conn objects are added to the hash table twice, once
2391 * for the original direction tuple, once for the reply tuple.
2392 *
2393 * Exception: In the IPS_NAT_CLASH case, only the reply
2394 * tuple is added (the original tuple already existed for
2395 * a different object).
2396 *
2397 * We only need to call the iterator once for each
2398 * conntrack, so we just use the 'reply' direction
2399 * tuple while iterating.
2400 */
2401 ct = nf_ct_tuplehash_to_ctrack(h);
2402
2403 if (iter_data->net &&
2404 !net_eq(iter_data->net, nf_ct_net(ct)))
2405 continue;
2406
2407 if (iter(ct, iter_data->data))
2408 goto found;
2409 }
2410 spin_unlock(lockp);
2411 local_bh_enable();
2412 cond_resched();
2413 }
2414
2415 return NULL;
2416 found:
2417 refcount_inc(&ct->ct_general.use);
2418 spin_unlock(lockp);
2419 local_bh_enable();
2420 return ct;
2421 }
2422
2423 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2424 const struct nf_ct_iter_data *iter_data)
2425 {
2426 unsigned int bucket = 0;
2427 struct nf_conn *ct;
2428
2429 might_sleep();
2430
2431 mutex_lock(&nf_conntrack_mutex);
2432 while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2433 /* Time to push up daisies... */
2434
2435 nf_ct_delete(ct, iter_data->portid, iter_data->report);
2436 nf_ct_put(ct);
2437 cond_resched();
2438 }
2439 mutex_unlock(&nf_conntrack_mutex);
2440 }
2441
2442 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2443 const struct nf_ct_iter_data *iter_data)
2444 {
2445 struct net *net = iter_data->net;
2446 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2447
2448 might_sleep();
2449
2450 if (atomic_read(&cnet->count) == 0)
2451 return;
2452
2453 nf_ct_iterate_cleanup(iter, iter_data);
2454 }
2455 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2456
2457 /**
2458 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2459 * @iter: callback to invoke for each conntrack
2460 * @data: data to pass to @iter
2461 *
2462 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2463 * unconfirmed list as dying (so they will not be inserted into
2464 * the main table).
2465 *
2466 * Can only be called in module exit path.
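 *
 * A minimal @iter callback simply returns nonzero for every entry it wants
 * removed; the sketch below is illustrative only (kill_all() further down
 * is the in-tree equivalent):
 *
 *	static int flush_all(struct nf_conn *ct, void *data)
 *	{
 *		return 1;
 *	}
 *
 *	nf_ct_iterate_destroy(flush_all, NULL);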
2467 */
2468 void
2469 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2470 {
2471 struct nf_ct_iter_data iter_data = {};
2472 struct net *net;
2473
2474 down_read(&net_rwsem);
2475 for_each_net(net) {
2476 struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2477
2478 if (atomic_read(&cnet->count) == 0)
2479 continue;
2480 nf_queue_nf_hook_drop(net);
2481 }
2482 up_read(&net_rwsem);
2483
2484 /* Need to wait for the netns cleanup worker to finish, if it's
2485 * running -- it might have deleted a net namespace from
2486 * the global list, so the hook drop above might not have
2487 * affected all namespaces.
2488 */
2489 net_ns_barrier();
2490
2491 /* An skb with an unconfirmed conntrack could have been reinjected just
2492 * before we called nf_queue_nf_hook_drop().
2493 *
2494 * This makes sure it's inserted into the conntrack table.
2495 */
2496 synchronize_net();
2497
2498 nf_ct_ext_bump_genid();
2499 iter_data.data = data;
2500 nf_ct_iterate_cleanup(iter, &iter_data);
2501
2502 /* Another CPU might be in an RCU read section with
2503 * the RCU-protected pointer cleared in the iter callback
2504 * or hidden via nf_ct_ext_bump_genid() above.
2505 *
2506 * Wait until those are done.
2507 */
2508 synchronize_rcu();
2509 }
2510 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2511
2512 static int kill_all(struct nf_conn *i, void *data)
2513 {
2514 return 1;
2515 }
2516
2517 void nf_conntrack_cleanup_start(void)
2518 {
2519 cleanup_nf_conntrack_bpf();
2520 conntrack_gc_work.exiting = true;
2521 }
2522
2523 void nf_conntrack_cleanup_end(void)
2524 {
2525 RCU_INIT_POINTER(nf_ct_hook, NULL);
2526 cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2527 kvfree(nf_conntrack_hash);
2528
2529 nf_conntrack_proto_fini();
2530 nf_conntrack_helper_fini();
2531 nf_conntrack_expect_fini();
2532
2533 kmem_cache_destroy(nf_conntrack_cachep);
2534 }
2535
2536 /*
2537 * Mishearing the voices in his head, our hero wonders how he's
2538 * supposed to kill the mall.
2539 */
2540 void nf_conntrack_cleanup_net(struct net *net)
2541 {
2542 LIST_HEAD(single);
2543
2544 list_add(&net->exit_list, &single);
2545 nf_conntrack_cleanup_net_list(&single);
2546 }
2547
2548 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2549 {
2550 struct nf_ct_iter_data iter_data = {};
2551 struct net *net;
2552 int busy;
2553
2554 /*
2555 * This makes sure all current packets have passed through
2556 * the netfilter framework. Roll on, two-stage module
2557 * delete...
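 *
 * After that, each exiting namespace's conntrack table is flushed in a
 * loop: as long as entries remain referenced (cnet->count stays nonzero),
 * we schedule() and try again.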
2558 */ 2559 synchronize_net(); 2560 i_see_dead_people: 2561 busy = 0; 2562 list_for_each_entry(net, net_exit_list, exit_list) { 2563 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2564 2565 iter_data.net = net; 2566 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2567 if (atomic_read(&cnet->count) != 0) 2568 busy = 1; 2569 } 2570 if (busy) { 2571 schedule(); 2572 goto i_see_dead_people; 2573 } 2574 2575 list_for_each_entry(net, net_exit_list, exit_list) { 2576 nf_conntrack_ecache_pernet_fini(net); 2577 nf_conntrack_expect_pernet_fini(net); 2578 free_percpu(net->ct.stat); 2579 } 2580 } 2581 2582 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2583 { 2584 struct hlist_nulls_head *hash; 2585 unsigned int nr_slots, i; 2586 2587 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2588 return NULL; 2589 2590 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2591 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2592 2593 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2594 2595 if (hash && nulls) 2596 for (i = 0; i < nr_slots; i++) 2597 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2598 2599 return hash; 2600 } 2601 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2602 2603 int nf_conntrack_hash_resize(unsigned int hashsize) 2604 { 2605 int i, bucket; 2606 unsigned int old_size; 2607 struct hlist_nulls_head *hash, *old_hash; 2608 struct nf_conntrack_tuple_hash *h; 2609 struct nf_conn *ct; 2610 2611 if (!hashsize) 2612 return -EINVAL; 2613 2614 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2615 if (!hash) 2616 return -ENOMEM; 2617 2618 mutex_lock(&nf_conntrack_mutex); 2619 old_size = nf_conntrack_htable_size; 2620 if (old_size == hashsize) { 2621 mutex_unlock(&nf_conntrack_mutex); 2622 kvfree(hash); 2623 return 0; 2624 } 2625 2626 local_bh_disable(); 2627 nf_conntrack_all_lock(); 2628 write_seqcount_begin(&nf_conntrack_generation); 2629 2630 /* Lookups in the old hash might happen in parallel, which means we 2631 * might get false negatives during connection lookup. New connections 2632 * created because of a false negative won't make it into the hash 2633 * though since that required taking the locks. 2634 */ 2635 2636 for (i = 0; i < nf_conntrack_htable_size; i++) { 2637 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2638 unsigned int zone_id; 2639 2640 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2641 struct nf_conntrack_tuple_hash, hnnode); 2642 ct = nf_ct_tuplehash_to_ctrack(h); 2643 hlist_nulls_del_rcu(&h->hnnode); 2644 2645 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2646 bucket = __hash_conntrack(nf_ct_net(ct), 2647 &h->tuple, zone_id, hashsize); 2648 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2649 } 2650 } 2651 old_hash = nf_conntrack_hash; 2652 2653 nf_conntrack_hash = hash; 2654 nf_conntrack_htable_size = hashsize; 2655 2656 write_seqcount_end(&nf_conntrack_generation); 2657 nf_conntrack_all_unlock(); 2658 local_bh_enable(); 2659 2660 mutex_unlock(&nf_conntrack_mutex); 2661 2662 synchronize_net(); 2663 kvfree(old_hash); 2664 return 0; 2665 } 2666 2667 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2668 { 2669 unsigned int hashsize; 2670 int rc; 2671 2672 if (current->nsproxy->net_ns != &init_net) 2673 return -EOPNOTSUPP; 2674 2675 /* On boot, we can set this without any fancy locking. 
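 * In that case the hash table has not been allocated yet, so only the
 * module parameter value needs to be recorded. Once the table exists,
 * writing the parameter goes through a full nf_conntrack_hash_resize()
 * instead.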
*/ 2676 if (!nf_conntrack_hash) 2677 return param_set_uint(val, kp); 2678 2679 rc = kstrtouint(val, 0, &hashsize); 2680 if (rc) 2681 return rc; 2682 2683 return nf_conntrack_hash_resize(hashsize); 2684 } 2685 2686 int nf_conntrack_init_start(void) 2687 { 2688 unsigned long nr_pages = totalram_pages(); 2689 int max_factor = 8; 2690 int ret = -ENOMEM; 2691 int i; 2692 2693 seqcount_spinlock_init(&nf_conntrack_generation, 2694 &nf_conntrack_locks_all_lock); 2695 2696 for (i = 0; i < CONNTRACK_LOCKS; i++) 2697 spin_lock_init(&nf_conntrack_locks[i]); 2698 2699 if (!nf_conntrack_htable_size) { 2700 nf_conntrack_htable_size 2701 = (((nr_pages << PAGE_SHIFT) / 16384) 2702 / sizeof(struct hlist_head)); 2703 if (BITS_PER_LONG >= 64 && 2704 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2705 nf_conntrack_htable_size = 262144; 2706 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2707 nf_conntrack_htable_size = 65536; 2708 2709 if (nf_conntrack_htable_size < 1024) 2710 nf_conntrack_htable_size = 1024; 2711 /* Use a max. factor of one by default to keep the average 2712 * hash chain length at 2 entries. Each entry has to be added 2713 * twice (once for original direction, once for reply). 2714 * When a table size is given we use the old value of 8 to 2715 * avoid implicit reduction of the max entries setting. 2716 */ 2717 max_factor = 1; 2718 } 2719 2720 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2721 if (!nf_conntrack_hash) 2722 return -ENOMEM; 2723 2724 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2725 2726 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2727 sizeof(struct nf_conn), 2728 NFCT_INFOMASK + 1, 2729 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2730 if (!nf_conntrack_cachep) 2731 goto err_cachep; 2732 2733 ret = nf_conntrack_expect_init(); 2734 if (ret < 0) 2735 goto err_expect; 2736 2737 ret = nf_conntrack_helper_init(); 2738 if (ret < 0) 2739 goto err_helper; 2740 2741 ret = nf_conntrack_proto_init(); 2742 if (ret < 0) 2743 goto err_proto; 2744 2745 conntrack_gc_work_init(&conntrack_gc_work); 2746 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2747 2748 ret = register_nf_conntrack_bpf(); 2749 if (ret < 0) 2750 goto err_kfunc; 2751 2752 return 0; 2753 2754 err_kfunc: 2755 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2756 nf_conntrack_proto_fini(); 2757 err_proto: 2758 nf_conntrack_helper_fini(); 2759 err_helper: 2760 nf_conntrack_expect_fini(); 2761 err_expect: 2762 kmem_cache_destroy(nf_conntrack_cachep); 2763 err_cachep: 2764 kvfree(nf_conntrack_hash); 2765 return ret; 2766 } 2767 2768 static const struct nf_ct_hook nf_conntrack_hook = { 2769 .update = nf_conntrack_update, 2770 .destroy = nf_ct_destroy, 2771 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2772 .attach = nf_conntrack_attach, 2773 }; 2774 2775 void nf_conntrack_init_end(void) 2776 { 2777 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2778 } 2779 2780 /* 2781 * We need to use special "null" values, not used in hash table 2782 */ 2783 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2784 2785 int nf_conntrack_init_net(struct net *net) 2786 { 2787 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2788 int ret = -ENOMEM; 2789 2790 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2791 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2792 atomic_set(&cnet->count, 0); 2793 2794 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2795 if (!net->ct.stat) 2796 return ret; 2797 2798 ret = nf_conntrack_expect_pernet_init(net); 2799 if (ret < 
0)
2800 goto err_expect;
2801
2802 nf_conntrack_acct_pernet_init(net);
2803 nf_conntrack_tstamp_pernet_init(net);
2804 nf_conntrack_ecache_pernet_init(net);
2805 nf_conntrack_proto_pernet_init(net);
2806
2807 return 0;
2808
2809 err_expect:
2810 free_percpu(net->ct.stat);
2811 return ret;
2812 }
2813
2814 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
2815
2816 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
2817 {
2818 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2819 return -EPERM;
2820
2821 __nf_ct_set_timeout(ct, timeout);
2822
2823 if (test_bit(IPS_DYING_BIT, &ct->status))
2824 return -ETIME;
2825
2826 return 0;
2827 }
2828 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
2829
2830 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
2831 {
2832 unsigned int bit;
2833
2834 /* Ignore these unchangeable bits */
2835 on &= ~IPS_UNCHANGEABLE_MASK;
2836 off &= ~IPS_UNCHANGEABLE_MASK;
2837
2838 for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
2839 if (on & (1 << bit))
2840 set_bit(bit, &ct->status);
2841 else if (off & (1 << bit))
2842 clear_bit(bit, &ct->status);
2843 }
2844 }
2845 EXPORT_SYMBOL_GPL(__nf_ct_change_status);
2846
2847 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
2848 {
2849 unsigned long d;
2850
2851 d = ct->status ^ status;
2852
2853 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
2854 /* unchangeable */
2855 return -EBUSY;
2856
2857 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2858 /* SEEN_REPLY bit can only be set */
2859 return -EBUSY;
2860
2861 if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2862 /* ASSURED bit can only be set */
2863 return -EBUSY;
2864
2865 __nf_ct_change_status(ct, status, 0);
2866 return 0;
2867 }
2868 EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
2869
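
/* Example (illustrative sketch, not part of the build): the helpers above
 * let ctnetlink and the conntrack BPF kfuncs modify only the mutable
 * status bits.  Given a valid conntrack @ct, setting IPS_ASSURED on top of
 * the current status succeeds, while clearing IPS_ASSURED or
 * IPS_SEEN_REPLY, or touching IPS_EXPECTED/IPS_CONFIRMED/IPS_DYING, is
 * rejected with -EBUSY by nf_ct_change_status_common():
 *
 *	int err;
 *
 *	err = nf_ct_change_status_common(ct, ct->status | IPS_ASSURED);
 *	// err == 0, IPS_ASSURED is now set
 *
 *	err = nf_ct_change_status_common(ct, ct->status & ~IPS_ASSURED);
 *	// err == -EBUSY, ASSURED can be set but never cleared
 */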