// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wake up often just because we have three entries with a 1s timeout while
 * still allowing non-idle machines to wake up more often when needed.
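 * The bias decays as the scan proceeds: each scanned entry folds its
 * remaining timeout into the running average that picks the next interval.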
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	u64 a, b, c, d;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	/* The direction must be ignored, handle usable tuplehash members manually */
	a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3];
	b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3];

	c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16;
	c |= tuple->dst.protonum;

	d = (u64)zoneid << 32 | net_hash_mix(net);

	/* IPv4: u3.all[1,2,3] == 0 */
	c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2];
	d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2];

	return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports.
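	 * The sport/dport pair sits at the start of the TCP, UDP, UDP-Lite,
	 * SCTP and DCCP headers alike, so this one helper covers all of them.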
	 */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets, we might still see fragments
	 * inside ICMP packets though.
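	 * A non-zero fragment offset means the transport header is not in
	 * this skb, so such a packet cannot be handed to an L4 tracker.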
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * It intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * The following nf_conn items do not change throughout the lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
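		 * The entry is parked on the ecache dying list below so the
		 * destroy event can be retransmitted later.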
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with an equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}

/* Find a connection corresponding to a tuple.
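 * On success a reference to the matching entry has been taken; the
 * caller must drop it with nf_ct_put().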
 */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because associated skb
	 * might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it.  Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet.
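		 * (the loser entry only accounted the handful of packets seen
		 * before the clash, so truncating the 64bit counter is harmless)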
		 */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
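	 * Only the REPLY direction is inserted below, so the entry is still
	 * reachable by reply packets and ages out via the 1s timeout set above.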
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted into the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood) or
	 * broadcast/multicast packets, which are skb_clone'd with
	 * an unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held, but will go
	 * out of scope soon); we need to remove this conntrack again.
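	 * nf_ct_kill() below takes it back out of the hash table that we
	 * just inserted it into.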
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway.
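   Early drop only runs when the table is already full, so evicting a
   borderline entry is preferable to dropping the new packet.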
*/
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference.  If that fails timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
		return false;
	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
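			/* hash table shrank below our bucket; the next run restarts at 0 */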
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				if (!nf_conntrack_max95)
					continue;
			}

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain. But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
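	 * Concurrent lookups will then fail refcount_inc_not_zero() and skip
	 * the entry until it is fully initialized and published.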
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);

/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			/* Welcome, Mr. Bond.  We've been expecting you...
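			 * An expectation matched: flag the new conntrack as
			 * expected and inherit the master's helper, mark and
			 * secmark where configured.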
			 */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Other CPU might have obtained a pointer to this object before it was
	 * released.  Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else other
	 * core might observe CONFIRMED bit which means the entry is valid and
	 * in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}

/*
 * icmp packets need special treatment to handle error messages that are
 * related to a connection.
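 * (an ICMP error embeds the header of the offending packet; that inner
 * header is used to look up the flow the error refers to)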
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in such case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)?  Ignore.
*/ 1976 if ((tmpl && !nf_ct_is_template(tmpl)) || 1977 ctinfo == IP_CT_UNTRACKED) 1978 return NF_ACCEPT; 1979 skb->_nfct = 0; 1980 } 1981 1982 /* rcu_read_lock()ed by nf_hook_thresh */ 1983 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1984 if (dataoff <= 0) { 1985 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1986 ret = NF_ACCEPT; 1987 goto out; 1988 } 1989 1990 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1991 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1992 protonum, state); 1993 if (ret <= 0) { 1994 ret = -ret; 1995 goto out; 1996 } 1997 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1998 if (skb->_nfct) 1999 goto out; 2000 } 2001 repeat: 2002 ret = resolve_normal_ct(tmpl, skb, dataoff, 2003 protonum, state); 2004 if (ret < 0) { 2005 /* Too stressed to deal. */ 2006 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2007 ret = NF_DROP; 2008 goto out; 2009 } 2010 2011 ct = nf_ct_get(skb, &ctinfo); 2012 if (!ct) { 2013 /* Not valid part of a connection */ 2014 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2015 ret = NF_ACCEPT; 2016 goto out; 2017 } 2018 2019 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2020 if (ret <= 0) { 2021 /* Invalid: inverse of the return code tells 2022 * the netfilter core what to do */ 2023 nf_ct_put(ct); 2024 skb->_nfct = 0; 2025 /* Special case: TCP tracker reports an attempt to reopen a 2026 * closed/aborted connection. We have to go back and create a 2027 * fresh conntrack. 2028 */ 2029 if (ret == -NF_REPEAT) 2030 goto repeat; 2031 2032 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2033 if (ret == -NF_DROP) 2034 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2035 2036 ret = -ret; 2037 goto out; 2038 } 2039 2040 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2041 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2042 nf_conntrack_event_cache(IPCT_REPLY, ct); 2043 out: 2044 if (tmpl) 2045 nf_ct_put(tmpl); 2046 2047 return ret; 2048 } 2049 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2050 2051 /* Alter reply tuple (maybe alter helper). 
This is for NAT, and is 2052 implicitly racy: see __nf_conntrack_confirm */ 2053 void nf_conntrack_alter_reply(struct nf_conn *ct, 2054 const struct nf_conntrack_tuple *newreply) 2055 { 2056 struct nf_conn_help *help = nfct_help(ct); 2057 2058 /* Should be unconfirmed, so not in hash table yet */ 2059 WARN_ON(nf_ct_is_confirmed(ct)); 2060 2061 nf_ct_dump_tuple(newreply); 2062 2063 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 2064 if (ct->master || (help && !hlist_empty(&help->expectations))) 2065 return; 2066 } 2067 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 2068 2069 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2070 void __nf_ct_refresh_acct(struct nf_conn *ct, 2071 enum ip_conntrack_info ctinfo, 2072 const struct sk_buff *skb, 2073 u32 extra_jiffies, 2074 bool do_acct) 2075 { 2076 /* Only update if this is not a fixed timeout */ 2077 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2078 goto acct; 2079 2080 /* If not in hash table, timer will not be active yet */ 2081 if (nf_ct_is_confirmed(ct)) 2082 extra_jiffies += nfct_time_stamp; 2083 2084 if (READ_ONCE(ct->timeout) != extra_jiffies) 2085 WRITE_ONCE(ct->timeout, extra_jiffies); 2086 acct: 2087 if (do_acct) 2088 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2089 } 2090 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2091 2092 bool nf_ct_kill_acct(struct nf_conn *ct, 2093 enum ip_conntrack_info ctinfo, 2094 const struct sk_buff *skb) 2095 { 2096 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2097 2098 return nf_ct_delete(ct, 0, 0); 2099 } 2100 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2101 2102 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2103 2104 #include <linux/netfilter/nfnetlink.h> 2105 #include <linux/netfilter/nfnetlink_conntrack.h> 2106 #include <linux/mutex.h> 2107 2108 /* Generic function for tcp/udp/sctp/dccp and alike. */ 2109 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2110 const struct nf_conntrack_tuple *tuple) 2111 { 2112 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2113 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2114 goto nla_put_failure; 2115 return 0; 2116 2117 nla_put_failure: 2118 return -1; 2119 } 2120 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2121 2122 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2123 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2124 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2125 }; 2126 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2127 2128 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2129 struct nf_conntrack_tuple *t, 2130 u_int32_t flags) 2131 { 2132 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2133 if (!tb[CTA_PROTO_SRC_PORT]) 2134 return -EINVAL; 2135 2136 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2137 } 2138 2139 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2140 if (!tb[CTA_PROTO_DST_PORT]) 2141 return -EINVAL; 2142 2143 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2144 } 2145 2146 return 0; 2147 } 2148 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2149 2150 unsigned int nf_ct_port_nlattr_tuple_size(void) 2151 { 2152 static unsigned int size __read_mostly; 2153 2154 if (!size) 2155 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2156 2157 return size; 2158 } 2159 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2160 #endif 2161 2162 /* Used by ipt_REJECT and ip6t_REJECT. 
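 * Those targets answer a packet with a TCP reset or an ICMP error sent in
 * the reverse direction; attaching the original packet's conntrack as
 * RELATED keeps the generated reply from being treated as INVALID.
 * Roughly, a reject-style target reaches this via the nf_ct_attach()
 * wrapper:
 *
 *	nskb = <build TCP RST or ICMP error from oldskb>;
 *	nf_ct_attach(nskb, oldskb);
 *	ip_local_out(net, nskb->sk, nskb);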
*/ 2163 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2164 { 2165 struct nf_conn *ct; 2166 enum ip_conntrack_info ctinfo; 2167 2168 /* This ICMP is in reverse direction to the packet which caused it */ 2169 ct = nf_ct_get(skb, &ctinfo); 2170 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2171 ctinfo = IP_CT_RELATED_REPLY; 2172 else 2173 ctinfo = IP_CT_RELATED; 2174 2175 /* Attach to new skbuff, and increment count */ 2176 nf_ct_set(nskb, ct, ctinfo); 2177 nf_conntrack_get(skb_nfct(nskb)); 2178 } 2179 2180 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2181 struct nf_conn *ct, 2182 enum ip_conntrack_info ctinfo) 2183 { 2184 const struct nf_nat_hook *nat_hook; 2185 struct nf_conntrack_tuple_hash *h; 2186 struct nf_conntrack_tuple tuple; 2187 unsigned int status; 2188 int dataoff; 2189 u16 l3num; 2190 u8 l4num; 2191 2192 l3num = nf_ct_l3num(ct); 2193 2194 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2195 if (dataoff <= 0) 2196 return -1; 2197 2198 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2199 l4num, net, &tuple)) 2200 return -1; 2201 2202 if (ct->status & IPS_SRC_NAT) { 2203 memcpy(tuple.src.u3.all, 2204 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2205 sizeof(tuple.src.u3.all)); 2206 tuple.src.u.all = 2207 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2208 } 2209 2210 if (ct->status & IPS_DST_NAT) { 2211 memcpy(tuple.dst.u3.all, 2212 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2213 sizeof(tuple.dst.u3.all)); 2214 tuple.dst.u.all = 2215 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2216 } 2217 2218 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2219 if (!h) 2220 return 0; 2221 2222 /* Store status bits of the conntrack that is clashing to re-do NAT 2223 * mangling according to what it has been done already to this packet. 2224 */ 2225 status = ct->status; 2226 2227 nf_ct_put(ct); 2228 ct = nf_ct_tuplehash_to_ctrack(h); 2229 nf_ct_set(skb, ct, ctinfo); 2230 2231 nat_hook = rcu_dereference(nf_nat_hook); 2232 if (!nat_hook) 2233 return 0; 2234 2235 if (status & IPS_SRC_NAT && 2236 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2237 IP_CT_DIR_ORIGINAL) == NF_DROP) 2238 return -1; 2239 2240 if (status & IPS_DST_NAT && 2241 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2242 IP_CT_DIR_ORIGINAL) == NF_DROP) 2243 return -1; 2244 2245 return 0; 2246 } 2247 2248 /* This packet is coming from userspace via nf_queue, complete the packet 2249 * processing after the helper invocation in nf_confirm(). 
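 * (A userspace helper may have mangled the payload while the packet was
 * queued, so sequence adjustment and the final confirm are completed here
 * once the packet is handed back to the kernel.)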
2250 */ 2251 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2252 enum ip_conntrack_info ctinfo) 2253 { 2254 const struct nf_conntrack_helper *helper; 2255 const struct nf_conn_help *help; 2256 int protoff; 2257 2258 help = nfct_help(ct); 2259 if (!help) 2260 return 0; 2261 2262 helper = rcu_dereference(help->helper); 2263 if (!helper) 2264 return 0; 2265 2266 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2267 return 0; 2268 2269 switch (nf_ct_l3num(ct)) { 2270 case NFPROTO_IPV4: 2271 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2272 break; 2273 #if IS_ENABLED(CONFIG_IPV6) 2274 case NFPROTO_IPV6: { 2275 __be16 frag_off; 2276 u8 pnum; 2277 2278 pnum = ipv6_hdr(skb)->nexthdr; 2279 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2280 &frag_off); 2281 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2282 return 0; 2283 break; 2284 } 2285 #endif 2286 default: 2287 return 0; 2288 } 2289 2290 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2291 !nf_is_loopback_packet(skb)) { 2292 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2293 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2294 return -1; 2295 } 2296 } 2297 2298 /* We've seen it coming out the other side: confirm it */ 2299 return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; 2300 } 2301 2302 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2303 { 2304 enum ip_conntrack_info ctinfo; 2305 struct nf_conn *ct; 2306 int err; 2307 2308 ct = nf_ct_get(skb, &ctinfo); 2309 if (!ct) 2310 return 0; 2311 2312 if (!nf_ct_is_confirmed(ct)) { 2313 err = __nf_conntrack_update(net, skb, ct, ctinfo); 2314 if (err < 0) 2315 return err; 2316 2317 ct = nf_ct_get(skb, &ctinfo); 2318 } 2319 2320 return nf_confirm_cthelper(skb, ct, ctinfo); 2321 } 2322 2323 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2324 const struct sk_buff *skb) 2325 { 2326 const struct nf_conntrack_tuple *src_tuple; 2327 const struct nf_conntrack_tuple_hash *hash; 2328 struct nf_conntrack_tuple srctuple; 2329 enum ip_conntrack_info ctinfo; 2330 struct nf_conn *ct; 2331 2332 ct = nf_ct_get(skb, &ctinfo); 2333 if (ct) { 2334 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2335 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2336 return true; 2337 } 2338 2339 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2340 NFPROTO_IPV4, dev_net(skb->dev), 2341 &srctuple)) 2342 return false; 2343 2344 hash = nf_conntrack_find_get(dev_net(skb->dev), 2345 &nf_ct_zone_dflt, 2346 &srctuple); 2347 if (!hash) 2348 return false; 2349 2350 ct = nf_ct_tuplehash_to_ctrack(hash); 2351 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2352 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2353 nf_ct_put(ct); 2354 2355 return true; 2356 } 2357 2358 /* Bring out ya dead! 
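 *
 * get_next_corpse() walks the table one bucket at a time under the
 * per-bucket lock, calls the iterator on each entry (using only the
 * reply-direction tuple so every conntrack is visited once) and returns
 * the first match with its refcount bumped, so the caller can delete it
 * after the lock has been dropped.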
 */
2359 static struct nf_conn *
2360 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
2361 		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
2362 {
2363 	struct nf_conntrack_tuple_hash *h;
2364 	struct nf_conn *ct;
2365 	struct hlist_nulls_node *n;
2366 	spinlock_t *lockp;
2367 
2368 	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
2369 		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];
2370 
2371 		if (hlist_nulls_empty(hslot))
2372 			continue;
2373 
2374 		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
2375 		local_bh_disable();
2376 		nf_conntrack_lock(lockp);
2377 		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
2378 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
2379 				continue;
2380 			/* All nf_conn objects are added to the hash table twice, once
2381 			 * for the original direction tuple, once for the reply tuple.
2382 			 *
2383 			 * Exception: In the IPS_NAT_CLASH case, only the reply
2384 			 * tuple is added (the original tuple already existed for
2385 			 * a different object).
2386 			 *
2387 			 * We only need to call the iterator once for each
2388 			 * conntrack, so we just use the 'reply' direction
2389 			 * tuple while iterating.
2390 			 */
2391 			ct = nf_ct_tuplehash_to_ctrack(h);
2392 
2393 			if (iter_data->net &&
2394 			    !net_eq(iter_data->net, nf_ct_net(ct)))
2395 				continue;
2396 
2397 			if (iter(ct, iter_data->data))
2398 				goto found;
2399 		}
2400 		spin_unlock(lockp);
2401 		local_bh_enable();
2402 		cond_resched();
2403 	}
2404 
2405 	return NULL;
2406 found:
2407 	refcount_inc(&ct->ct_general.use);
2408 	spin_unlock(lockp);
2409 	local_bh_enable();
2410 	return ct;
2411 }
2412 
2413 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
2414 				  const struct nf_ct_iter_data *iter_data)
2415 {
2416 	unsigned int bucket = 0;
2417 	struct nf_conn *ct;
2418 
2419 	might_sleep();
2420 
2421 	mutex_lock(&nf_conntrack_mutex);
2422 	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
2423 		/* Time to push up daisies... */
2424 
2425 		nf_ct_delete(ct, iter_data->portid, iter_data->report);
2426 		nf_ct_put(ct);
2427 		cond_resched();
2428 	}
2429 	mutex_unlock(&nf_conntrack_mutex);
2430 }
2431 
2432 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
2433 			       const struct nf_ct_iter_data *iter_data)
2434 {
2435 	struct net *net = iter_data->net;
2436 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2437 
2438 	might_sleep();
2439 
2440 	if (atomic_read(&cnet->count) == 0)
2441 		return;
2442 
2443 	nf_ct_iterate_cleanup(iter, iter_data);
2444 }
2445 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
2446 
2447 /**
2448  * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
2449  * @iter: callback to invoke for each conntrack
2450  * @data: data to pass to @iter
2451  *
2452  * Like nf_ct_iterate_cleanup, but first marks conntracks on the
2453  * unconfirmed list as dying (so they will not be inserted into
2454  * the main table).
2455  *
2456  * Can only be called in module exit path.
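 *
 * A nonzero return value from @iter causes the entry to be removed.  A
 * module exit path might use it roughly like this (illustrative sketch
 * only; the callback name is made up):
 *
 *	static int kill_my_proto(struct nf_conn *ct, void *data)
 *	{
 *		return nf_ct_protonum(ct) == IPPROTO_GRE;
 *	}
 *	...
 *	nf_ct_iterate_destroy(kill_my_proto, NULL);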
2457  */
2458 void
2459 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
2460 {
2461 	struct nf_ct_iter_data iter_data = {};
2462 	struct net *net;
2463 
2464 	down_read(&net_rwsem);
2465 	for_each_net(net) {
2466 		struct nf_conntrack_net *cnet = nf_ct_pernet(net);
2467 
2468 		if (atomic_read(&cnet->count) == 0)
2469 			continue;
2470 		nf_queue_nf_hook_drop(net);
2471 	}
2472 	up_read(&net_rwsem);
2473 
2474 	/* Need to wait for the netns cleanup worker to finish, if it's
2475 	 * running -- it might have deleted a net namespace from
2476 	 * the global list, so the hook drop above might not have
2477 	 * affected all namespaces.
2478 	 */
2479 	net_ns_barrier();
2480 
2481 	/* An skb with an unconfirmed conntrack could have been reinjected just
2482 	 * before we called nf_queue_nf_hook_drop().
2483 	 *
2484 	 * This makes sure it is inserted into the conntrack table.
2485 	 */
2486 	synchronize_net();
2487 
2488 	nf_ct_ext_bump_genid();
2489 	iter_data.data = data;
2490 	nf_ct_iterate_cleanup(iter, &iter_data);
2491 
2492 	/* Another cpu might be in an rcu read section with
2493 	 * an rcu-protected pointer cleared in the iter callback
2494 	 * or hidden via nf_ct_ext_bump_genid() above.
2495 	 *
2496 	 * Wait until those are done.
2497 	 */
2498 	synchronize_rcu();
2499 }
2500 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
2501 
2502 static int kill_all(struct nf_conn *i, void *data)
2503 {
2504 	return 1;
2505 }
2506 
2507 void nf_conntrack_cleanup_start(void)
2508 {
2509 	cleanup_nf_conntrack_bpf();
2510 	conntrack_gc_work.exiting = true;
2511 }
2512 
2513 void nf_conntrack_cleanup_end(void)
2514 {
2515 	RCU_INIT_POINTER(nf_ct_hook, NULL);
2516 	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
2517 	kvfree(nf_conntrack_hash);
2518 
2519 	nf_conntrack_proto_fini();
2520 	nf_conntrack_helper_fini();
2521 	nf_conntrack_expect_fini();
2522 
2523 	kmem_cache_destroy(nf_conntrack_cachep);
2524 }
2525 
2526 /*
2527  * Mishearing the voices in his head, our hero wonders how he's
2528  * supposed to kill the mall.
2529  */
2530 void nf_conntrack_cleanup_net(struct net *net)
2531 {
2532 	LIST_HEAD(single);
2533 
2534 	list_add(&net->exit_list, &single);
2535 	nf_conntrack_cleanup_net_list(&single);
2536 }
2537 
2538 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
2539 {
2540 	struct nf_ct_iter_data iter_data = {};
2541 	struct net *net;
2542 	int busy;
2543 
2544 	/*
2545 	 * This makes sure all current packets have passed through
2546 	 * the netfilter framework. Roll on, two-stage module
2547 	 * delete...
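	 * The retry loop below keeps calling kill_all() until the per-netns
	 * entry count reaches zero; the count only drops once all remaining
	 * references (e.g. from packets still in flight) have been put.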
2548 */ 2549 synchronize_net(); 2550 i_see_dead_people: 2551 busy = 0; 2552 list_for_each_entry(net, net_exit_list, exit_list) { 2553 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2554 2555 iter_data.net = net; 2556 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2557 if (atomic_read(&cnet->count) != 0) 2558 busy = 1; 2559 } 2560 if (busy) { 2561 schedule(); 2562 goto i_see_dead_people; 2563 } 2564 2565 list_for_each_entry(net, net_exit_list, exit_list) { 2566 nf_conntrack_ecache_pernet_fini(net); 2567 nf_conntrack_expect_pernet_fini(net); 2568 free_percpu(net->ct.stat); 2569 } 2570 } 2571 2572 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2573 { 2574 struct hlist_nulls_head *hash; 2575 unsigned int nr_slots, i; 2576 2577 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2578 return NULL; 2579 2580 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2581 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2582 2583 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2584 2585 if (hash && nulls) 2586 for (i = 0; i < nr_slots; i++) 2587 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2588 2589 return hash; 2590 } 2591 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2592 2593 int nf_conntrack_hash_resize(unsigned int hashsize) 2594 { 2595 int i, bucket; 2596 unsigned int old_size; 2597 struct hlist_nulls_head *hash, *old_hash; 2598 struct nf_conntrack_tuple_hash *h; 2599 struct nf_conn *ct; 2600 2601 if (!hashsize) 2602 return -EINVAL; 2603 2604 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2605 if (!hash) 2606 return -ENOMEM; 2607 2608 mutex_lock(&nf_conntrack_mutex); 2609 old_size = nf_conntrack_htable_size; 2610 if (old_size == hashsize) { 2611 mutex_unlock(&nf_conntrack_mutex); 2612 kvfree(hash); 2613 return 0; 2614 } 2615 2616 local_bh_disable(); 2617 nf_conntrack_all_lock(); 2618 write_seqcount_begin(&nf_conntrack_generation); 2619 2620 /* Lookups in the old hash might happen in parallel, which means we 2621 * might get false negatives during connection lookup. New connections 2622 * created because of a false negative won't make it into the hash 2623 * though since that required taking the locks. 2624 */ 2625 2626 for (i = 0; i < nf_conntrack_htable_size; i++) { 2627 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2628 unsigned int zone_id; 2629 2630 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2631 struct nf_conntrack_tuple_hash, hnnode); 2632 ct = nf_ct_tuplehash_to_ctrack(h); 2633 hlist_nulls_del_rcu(&h->hnnode); 2634 2635 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2636 bucket = __hash_conntrack(nf_ct_net(ct), 2637 &h->tuple, zone_id, hashsize); 2638 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2639 } 2640 } 2641 old_hash = nf_conntrack_hash; 2642 2643 nf_conntrack_hash = hash; 2644 nf_conntrack_htable_size = hashsize; 2645 2646 write_seqcount_end(&nf_conntrack_generation); 2647 nf_conntrack_all_unlock(); 2648 local_bh_enable(); 2649 2650 mutex_unlock(&nf_conntrack_mutex); 2651 2652 synchronize_net(); 2653 kvfree(old_hash); 2654 return 0; 2655 } 2656 2657 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2658 { 2659 unsigned int hashsize; 2660 int rc; 2661 2662 if (current->nsproxy->net_ns != &init_net) 2663 return -EOPNOTSUPP; 2664 2665 /* On boot, we can set this without any fancy locking. 
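	 * At run time the resize path below is used instead; the parameter
	 * is normally poked through sysfs, e.g.:
	 *
	 *	echo 65536 > /sys/module/nf_conntrack/parameters/hashsize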
*/ 2666 if (!nf_conntrack_hash) 2667 return param_set_uint(val, kp); 2668 2669 rc = kstrtouint(val, 0, &hashsize); 2670 if (rc) 2671 return rc; 2672 2673 return nf_conntrack_hash_resize(hashsize); 2674 } 2675 2676 int nf_conntrack_init_start(void) 2677 { 2678 unsigned long nr_pages = totalram_pages(); 2679 int max_factor = 8; 2680 int ret = -ENOMEM; 2681 int i; 2682 2683 seqcount_spinlock_init(&nf_conntrack_generation, 2684 &nf_conntrack_locks_all_lock); 2685 2686 for (i = 0; i < CONNTRACK_LOCKS; i++) 2687 spin_lock_init(&nf_conntrack_locks[i]); 2688 2689 if (!nf_conntrack_htable_size) { 2690 nf_conntrack_htable_size 2691 = (((nr_pages << PAGE_SHIFT) / 16384) 2692 / sizeof(struct hlist_head)); 2693 if (BITS_PER_LONG >= 64 && 2694 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2695 nf_conntrack_htable_size = 262144; 2696 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2697 nf_conntrack_htable_size = 65536; 2698 2699 if (nf_conntrack_htable_size < 1024) 2700 nf_conntrack_htable_size = 1024; 2701 /* Use a max. factor of one by default to keep the average 2702 * hash chain length at 2 entries. Each entry has to be added 2703 * twice (once for original direction, once for reply). 2704 * When a table size is given we use the old value of 8 to 2705 * avoid implicit reduction of the max entries setting. 2706 */ 2707 max_factor = 1; 2708 } 2709 2710 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2711 if (!nf_conntrack_hash) 2712 return -ENOMEM; 2713 2714 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2715 2716 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2717 sizeof(struct nf_conn), 2718 NFCT_INFOMASK + 1, 2719 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2720 if (!nf_conntrack_cachep) 2721 goto err_cachep; 2722 2723 ret = nf_conntrack_expect_init(); 2724 if (ret < 0) 2725 goto err_expect; 2726 2727 ret = nf_conntrack_helper_init(); 2728 if (ret < 0) 2729 goto err_helper; 2730 2731 ret = nf_conntrack_proto_init(); 2732 if (ret < 0) 2733 goto err_proto; 2734 2735 conntrack_gc_work_init(&conntrack_gc_work); 2736 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2737 2738 ret = register_nf_conntrack_bpf(); 2739 if (ret < 0) 2740 goto err_kfunc; 2741 2742 return 0; 2743 2744 err_kfunc: 2745 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2746 nf_conntrack_proto_fini(); 2747 err_proto: 2748 nf_conntrack_helper_fini(); 2749 err_helper: 2750 nf_conntrack_expect_fini(); 2751 err_expect: 2752 kmem_cache_destroy(nf_conntrack_cachep); 2753 err_cachep: 2754 kvfree(nf_conntrack_hash); 2755 return ret; 2756 } 2757 2758 static void nf_conntrack_set_closing(struct nf_conntrack *nfct) 2759 { 2760 struct nf_conn *ct = nf_ct_to_nf_conn(nfct); 2761 2762 switch (nf_ct_protonum(ct)) { 2763 case IPPROTO_TCP: 2764 nf_conntrack_tcp_set_closing(ct); 2765 break; 2766 } 2767 } 2768 2769 static const struct nf_ct_hook nf_conntrack_hook = { 2770 .update = nf_conntrack_update, 2771 .destroy = nf_ct_destroy, 2772 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2773 .attach = nf_conntrack_attach, 2774 .set_closing = nf_conntrack_set_closing, 2775 }; 2776 2777 void nf_conntrack_init_end(void) 2778 { 2779 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2780 } 2781 2782 /* 2783 * We need to use special "null" values, not used in hash table 2784 */ 2785 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2786 2787 int nf_conntrack_init_net(struct net *net) 2788 { 2789 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2790 int ret = -ENOMEM; 2791 2792 
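	/* Compile-time sanity checks: IP_CT_UNTRACKED must stay distinct from
	 * IP_CT_NUMBER, and the number of hash locks must be a power of two.
	 */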
	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
2793 	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
2794 	atomic_set(&cnet->count, 0);
2795 
2796 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
2797 	if (!net->ct.stat)
2798 		return ret;
2799 
2800 	ret = nf_conntrack_expect_pernet_init(net);
2801 	if (ret < 0)
2802 		goto err_expect;
2803 
2804 	nf_conntrack_acct_pernet_init(net);
2805 	nf_conntrack_tstamp_pernet_init(net);
2806 	nf_conntrack_ecache_pernet_init(net);
2807 	nf_conntrack_proto_pernet_init(net);
2808 
2809 	return 0;
2810 
2811 err_expect:
2812 	free_percpu(net->ct.stat);
2813 	return ret;
2814 }
2815 
2816 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */
2817 
2818 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
2819 {
2820 	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
2821 		return -EPERM;
2822 
2823 	__nf_ct_set_timeout(ct, timeout);
2824 
2825 	if (test_bit(IPS_DYING_BIT, &ct->status))
2826 		return -ETIME;
2827 
2828 	return 0;
2829 }
2830 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);
2831 
2832 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
2833 {
2834 	unsigned int bit;
2835 
2836 	/* Ignore these unchangeable bits */
2837 	on &= ~IPS_UNCHANGEABLE_MASK;
2838 	off &= ~IPS_UNCHANGEABLE_MASK;
2839 
2840 	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
2841 		if (on & (1 << bit))
2842 			set_bit(bit, &ct->status);
2843 		else if (off & (1 << bit))
2844 			clear_bit(bit, &ct->status);
2845 	}
2846 }
2847 EXPORT_SYMBOL_GPL(__nf_ct_change_status);
2848 
2849 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
2850 {
2851 	unsigned long d;
2852 
2853 	d = ct->status ^ status;
2854 
2855 	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
2856 		/* unchangeable */
2857 		return -EBUSY;
2858 
2859 	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
2860 		/* SEEN_REPLY bit can only be set */
2861 		return -EBUSY;
2862 
2863 	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
2864 		/* ASSURED bit can only be set */
2865 		return -EBUSY;
2866 
2867 	__nf_ct_change_status(ct, status, 0);
2868 	return 0;
2869 }
2870 EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
2871 
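
/* Illustrative only: how the helpers above are meant to be used by the
 * ctnetlink/bpf paths. On a confirmed, established entry a status word
 * that adds IPS_ASSURED is accepted, while one that tries to clear
 * IPS_SEEN_REPLY is refused:
 *
 *	nf_ct_change_status_common(ct, ct->status | IPS_ASSURED);	-> 0
 *	nf_ct_change_status_common(ct, ct->status & ~IPS_SEEN_REPLY);	-> -EBUSY
 */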