1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Connection state tracking for netfilter. This is separated from, 3 but required by, the NAT layer; it can also be used by an iptables 4 extension. */ 5 6 /* (C) 1999-2001 Paul `Rusty' Russell 7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net> 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <linux/types.h> 15 #include <linux/netfilter.h> 16 #include <linux/module.h> 17 #include <linux/sched.h> 18 #include <linux/skbuff.h> 19 #include <linux/proc_fs.h> 20 #include <linux/vmalloc.h> 21 #include <linux/stddef.h> 22 #include <linux/slab.h> 23 #include <linux/random.h> 24 #include <linux/siphash.h> 25 #include <linux/err.h> 26 #include <linux/percpu.h> 27 #include <linux/moduleparam.h> 28 #include <linux/notifier.h> 29 #include <linux/kernel.h> 30 #include <linux/netdevice.h> 31 #include <linux/socket.h> 32 #include <linux/mm.h> 33 #include <linux/nsproxy.h> 34 #include <linux/rculist_nulls.h> 35 36 #include <net/netfilter/nf_conntrack.h> 37 #include <net/netfilter/nf_conntrack_bpf.h> 38 #include <net/netfilter/nf_conntrack_l4proto.h> 39 #include <net/netfilter/nf_conntrack_expect.h> 40 #include <net/netfilter/nf_conntrack_helper.h> 41 #include <net/netfilter/nf_conntrack_core.h> 42 #include <net/netfilter/nf_conntrack_extend.h> 43 #include <net/netfilter/nf_conntrack_acct.h> 44 #include <net/netfilter/nf_conntrack_ecache.h> 45 #include <net/netfilter/nf_conntrack_zones.h> 46 #include <net/netfilter/nf_conntrack_timestamp.h> 47 #include <net/netfilter/nf_conntrack_timeout.h> 48 #include <net/netfilter/nf_conntrack_labels.h> 49 #include <net/netfilter/nf_conntrack_synproxy.h> 50 #include <net/netfilter/nf_nat.h> 51 #include <net/netfilter/nf_nat_helper.h> 52 #include <net/netns/hash.h> 53 #include <net/ip.h> 54 55 #include "nf_internals.h" 56 57 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 58 EXPORT_SYMBOL_GPL(nf_conntrack_locks); 59 60 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 61 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 62 63 struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 64 EXPORT_SYMBOL_GPL(nf_conntrack_hash); 65 66 struct conntrack_gc_work { 67 struct delayed_work dwork; 68 u32 next_bucket; 69 u32 avg_timeout; 70 u32 count; 71 u32 start_time; 72 bool exiting; 73 bool early_drop; 74 }; 75 76 static __read_mostly struct kmem_cache *nf_conntrack_cachep; 77 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 78 static __read_mostly bool nf_conntrack_locks_all; 79 80 /* serialize hash resizes and nf_ct_iterate_cleanup */ 81 static DEFINE_MUTEX(nf_conntrack_mutex); 82 83 #define GC_SCAN_INTERVAL_MAX (60ul * HZ) 84 #define GC_SCAN_INTERVAL_MIN (1ul * HZ) 85 86 /* clamp timeouts to this value (TCP unacked) */ 87 #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) 88 89 /* Initial bias pretending we have 100 entries at the upper bound so we don't 90 * wakeup often just because we have three entries with a 1s timeout while still 91 * allowing non-idle machines to wakeup more often when needed. 
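 *
 * For example, under the scheme above: with the initial count of 100 entries
 * at GC_SCAN_INTERVAL_MAX, three entries with a 1s timeout only move the
 * running average to roughly (100 * 60s + 3 * 1s) / 103, i.e. about 58s,
 * so the worker still sleeps for close to a minute.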
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	8u
#define MAX_CHAINLEN	(32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
196 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 197 */ 198 smp_store_release(&nf_conntrack_locks_all, false); 199 spin_unlock(&nf_conntrack_locks_all_lock); 200 } 201 202 unsigned int nf_conntrack_htable_size __read_mostly; 203 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 204 205 unsigned int nf_conntrack_max __read_mostly; 206 EXPORT_SYMBOL_GPL(nf_conntrack_max); 207 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 208 static siphash_aligned_key_t nf_conntrack_hash_rnd; 209 210 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 211 unsigned int zoneid, 212 const struct net *net) 213 { 214 u64 a, b, c, d; 215 216 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 217 218 /* The direction must be ignored, handle usable tuplehash members manually */ 219 a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; 220 b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3]; 221 222 c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; 223 c |= tuple->dst.protonum; 224 225 d = (u64)zoneid << 32 | net_hash_mix(net); 226 227 /* IPv4: u3.all[1,2,3] == 0 */ 228 c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; 229 d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; 230 231 return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); 232 } 233 234 static u32 scale_hash(u32 hash) 235 { 236 return reciprocal_scale(hash, nf_conntrack_htable_size); 237 } 238 239 static u32 __hash_conntrack(const struct net *net, 240 const struct nf_conntrack_tuple *tuple, 241 unsigned int zoneid, 242 unsigned int size) 243 { 244 return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); 245 } 246 247 static u32 hash_conntrack(const struct net *net, 248 const struct nf_conntrack_tuple *tuple, 249 unsigned int zoneid) 250 { 251 return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); 252 } 253 254 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 255 unsigned int dataoff, 256 struct nf_conntrack_tuple *tuple) 257 { struct { 258 __be16 sport; 259 __be16 dport; 260 } _inet_hdr, *inet_hdr; 261 262 /* Actually only need first 4 bytes to get ports. 
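	 * (source and destination port are the first two 16-bit fields of the
	 * TCP, UDP, UDP-Lite, SCTP and DCCP headers, so one read covers them all).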
*/ 263 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 264 if (!inet_hdr) 265 return false; 266 267 tuple->src.u.udp.port = inet_hdr->sport; 268 tuple->dst.u.udp.port = inet_hdr->dport; 269 return true; 270 } 271 272 static bool 273 nf_ct_get_tuple(const struct sk_buff *skb, 274 unsigned int nhoff, 275 unsigned int dataoff, 276 u_int16_t l3num, 277 u_int8_t protonum, 278 struct net *net, 279 struct nf_conntrack_tuple *tuple) 280 { 281 unsigned int size; 282 const __be32 *ap; 283 __be32 _addrs[8]; 284 285 memset(tuple, 0, sizeof(*tuple)); 286 287 tuple->src.l3num = l3num; 288 switch (l3num) { 289 case NFPROTO_IPV4: 290 nhoff += offsetof(struct iphdr, saddr); 291 size = 2 * sizeof(__be32); 292 break; 293 case NFPROTO_IPV6: 294 nhoff += offsetof(struct ipv6hdr, saddr); 295 size = sizeof(_addrs); 296 break; 297 default: 298 return true; 299 } 300 301 ap = skb_header_pointer(skb, nhoff, size, _addrs); 302 if (!ap) 303 return false; 304 305 switch (l3num) { 306 case NFPROTO_IPV4: 307 tuple->src.u3.ip = ap[0]; 308 tuple->dst.u3.ip = ap[1]; 309 break; 310 case NFPROTO_IPV6: 311 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 312 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 313 break; 314 } 315 316 tuple->dst.protonum = protonum; 317 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 318 319 switch (protonum) { 320 #if IS_ENABLED(CONFIG_IPV6) 321 case IPPROTO_ICMPV6: 322 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); 323 #endif 324 case IPPROTO_ICMP: 325 return icmp_pkt_to_tuple(skb, dataoff, net, tuple); 326 #ifdef CONFIG_NF_CT_PROTO_GRE 327 case IPPROTO_GRE: 328 return gre_pkt_to_tuple(skb, dataoff, net, tuple); 329 #endif 330 case IPPROTO_TCP: 331 case IPPROTO_UDP: 332 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 333 case IPPROTO_UDPLITE: 334 #endif 335 #ifdef CONFIG_NF_CT_PROTO_SCTP 336 case IPPROTO_SCTP: 337 #endif 338 #ifdef CONFIG_NF_CT_PROTO_DCCP 339 case IPPROTO_DCCP: 340 #endif 341 /* fallthrough */ 342 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 343 default: 344 break; 345 } 346 347 return true; 348 } 349 350 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 351 u_int8_t *protonum) 352 { 353 int dataoff = -1; 354 const struct iphdr *iph; 355 struct iphdr _iph; 356 357 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 358 if (!iph) 359 return -1; 360 361 /* Conntrack defragments packets, we might still see fragments 362 * inside ICMP packets though. 
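	 * A non-zero fragment offset therefore makes us give up below and the
	 * packet is handled as untrackable.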
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * intentionally doesn't re-use any of the seeds used for hash
 * table location, we assume id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4.
the original direction tuple 492 */ 493 u32 nf_ct_get_id(const struct nf_conn *ct) 494 { 495 static siphash_aligned_key_t ct_id_seed; 496 unsigned long a, b, c, d; 497 498 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); 499 500 a = (unsigned long)ct; 501 b = (unsigned long)ct->master; 502 c = (unsigned long)nf_ct_net(ct); 503 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 504 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), 505 &ct_id_seed); 506 #ifdef CONFIG_64BIT 507 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); 508 #else 509 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); 510 #endif 511 } 512 EXPORT_SYMBOL_GPL(nf_ct_get_id); 513 514 static void 515 clean_from_lists(struct nf_conn *ct) 516 { 517 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 518 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 519 520 /* Destroy all pending expectations */ 521 nf_ct_remove_expectations(ct); 522 } 523 524 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 525 526 /* Released via nf_ct_destroy() */ 527 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 528 const struct nf_conntrack_zone *zone, 529 gfp_t flags) 530 { 531 struct nf_conn *tmpl, *p; 532 533 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 534 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 535 if (!tmpl) 536 return NULL; 537 538 p = tmpl; 539 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 540 if (tmpl != p) { 541 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 542 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 543 } 544 } else { 545 tmpl = kzalloc(sizeof(*tmpl), flags); 546 if (!tmpl) 547 return NULL; 548 } 549 550 tmpl->status = IPS_TEMPLATE; 551 write_pnet(&tmpl->ct_net, net); 552 nf_ct_zone_add(tmpl, zone); 553 refcount_set(&tmpl->ct_general.use, 1); 554 555 return tmpl; 556 } 557 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 558 559 void nf_ct_tmpl_free(struct nf_conn *tmpl) 560 { 561 kfree(tmpl->ext); 562 563 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 564 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 565 else 566 kfree(tmpl); 567 } 568 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 569 570 static void destroy_gre_conntrack(struct nf_conn *ct) 571 { 572 #ifdef CONFIG_NF_CT_PROTO_GRE 573 struct nf_conn *master = ct->master; 574 575 if (master) 576 nf_ct_gre_keymap_destroy(master); 577 #endif 578 } 579 580 void nf_ct_destroy(struct nf_conntrack *nfct) 581 { 582 struct nf_conn *ct = (struct nf_conn *)nfct; 583 584 WARN_ON(refcount_read(&nfct->use) != 0); 585 586 if (unlikely(nf_ct_is_template(ct))) { 587 nf_ct_tmpl_free(ct); 588 return; 589 } 590 591 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) 592 destroy_gre_conntrack(ct); 593 594 /* Expectations will have been removed in clean_from_lists, 595 * except TFTP can create an expectation on the first packet, 596 * before connection is in the list, so we need to clean here, 597 * too. 
598 */ 599 nf_ct_remove_expectations(ct); 600 601 if (ct->master) 602 nf_ct_put(ct->master); 603 604 nf_conntrack_free(ct); 605 } 606 EXPORT_SYMBOL(nf_ct_destroy); 607 608 static void __nf_ct_delete_from_lists(struct nf_conn *ct) 609 { 610 struct net *net = nf_ct_net(ct); 611 unsigned int hash, reply_hash; 612 unsigned int sequence; 613 614 do { 615 sequence = read_seqcount_begin(&nf_conntrack_generation); 616 hash = hash_conntrack(net, 617 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 618 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 619 reply_hash = hash_conntrack(net, 620 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 621 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 622 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 623 624 clean_from_lists(ct); 625 nf_conntrack_double_unlock(hash, reply_hash); 626 } 627 628 static void nf_ct_delete_from_lists(struct nf_conn *ct) 629 { 630 nf_ct_helper_destroy(ct); 631 local_bh_disable(); 632 633 __nf_ct_delete_from_lists(ct); 634 635 local_bh_enable(); 636 } 637 638 static void nf_ct_add_to_ecache_list(struct nf_conn *ct) 639 { 640 #ifdef CONFIG_NF_CONNTRACK_EVENTS 641 struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); 642 643 spin_lock(&cnet->ecache.dying_lock); 644 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 645 &cnet->ecache.dying_list); 646 spin_unlock(&cnet->ecache.dying_lock); 647 #endif 648 } 649 650 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 651 { 652 struct nf_conn_tstamp *tstamp; 653 struct net *net; 654 655 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 656 return false; 657 658 tstamp = nf_conn_tstamp_find(ct); 659 if (tstamp) { 660 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 661 662 tstamp->stop = ktime_get_real_ns(); 663 if (timeout < 0) 664 tstamp->stop -= jiffies_to_nsecs(-timeout); 665 } 666 667 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 668 portid, report) < 0) { 669 /* destroy event was not delivered. nf_ct_put will 670 * be done by event cache worker on redelivery. 
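		 * The entry is moved to the ecache dying list below
		 * (nf_ct_add_to_ecache_list) so the worker can still find it.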
671 */ 672 nf_ct_helper_destroy(ct); 673 local_bh_disable(); 674 __nf_ct_delete_from_lists(ct); 675 nf_ct_add_to_ecache_list(ct); 676 local_bh_enable(); 677 678 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); 679 return false; 680 } 681 682 net = nf_ct_net(ct); 683 if (nf_conntrack_ecache_dwork_pending(net)) 684 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); 685 nf_ct_delete_from_lists(ct); 686 nf_ct_put(ct); 687 return true; 688 } 689 EXPORT_SYMBOL_GPL(nf_ct_delete); 690 691 static inline bool 692 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 693 const struct nf_conntrack_tuple *tuple, 694 const struct nf_conntrack_zone *zone, 695 const struct net *net) 696 { 697 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 698 699 /* A conntrack can be recreated with the equal tuple, 700 * so we need to check that the conntrack is confirmed 701 */ 702 return nf_ct_tuple_equal(tuple, &h->tuple) && 703 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 704 nf_ct_is_confirmed(ct) && 705 net_eq(net, nf_ct_net(ct)); 706 } 707 708 static inline bool 709 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 710 { 711 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 712 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 713 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 714 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 715 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 716 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 717 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 718 } 719 720 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 721 static void nf_ct_gc_expired(struct nf_conn *ct) 722 { 723 if (!refcount_inc_not_zero(&ct->ct_general.use)) 724 return; 725 726 /* load ->status after refcount increase */ 727 smp_acquire__after_ctrl_dep(); 728 729 if (nf_ct_should_gc(ct)) 730 nf_ct_kill(ct); 731 732 nf_ct_put(ct); 733 } 734 735 /* 736 * Warning : 737 * - Caller must take a reference on returned object 738 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 739 */ 740 static struct nf_conntrack_tuple_hash * 741 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 742 const struct nf_conntrack_tuple *tuple, u32 hash) 743 { 744 struct nf_conntrack_tuple_hash *h; 745 struct hlist_nulls_head *ct_hash; 746 struct hlist_nulls_node *n; 747 unsigned int bucket, hsize; 748 749 begin: 750 nf_conntrack_get_ht(&ct_hash, &hsize); 751 bucket = reciprocal_scale(hash, hsize); 752 753 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 754 struct nf_conn *ct; 755 756 ct = nf_ct_tuplehash_to_ctrack(h); 757 if (nf_ct_is_expired(ct)) { 758 nf_ct_gc_expired(ct); 759 continue; 760 } 761 762 if (nf_ct_key_equal(h, tuple, zone, net)) 763 return h; 764 } 765 /* 766 * if the nulls value we got at the end of this lookup is 767 * not the expected one, we must restart lookup. 768 * We probably met an item that was moved to another chain. 769 */ 770 if (get_nulls_value(n) != bucket) { 771 NF_CT_STAT_INC_ATOMIC(net, search_restart); 772 goto begin; 773 } 774 775 return NULL; 776 } 777 778 /* Find a connection corresponding to a tuple. 
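 * On success a reference has been taken on the returned entry;
 * the caller must drop it with nf_ct_put().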
*/ 779 static struct nf_conntrack_tuple_hash * 780 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 781 const struct nf_conntrack_tuple *tuple, u32 hash) 782 { 783 struct nf_conntrack_tuple_hash *h; 784 struct nf_conn *ct; 785 786 h = ____nf_conntrack_find(net, zone, tuple, hash); 787 if (h) { 788 /* We have a candidate that matches the tuple we're interested 789 * in, try to obtain a reference and re-check tuple 790 */ 791 ct = nf_ct_tuplehash_to_ctrack(h); 792 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { 793 /* re-check key after refcount */ 794 smp_acquire__after_ctrl_dep(); 795 796 if (likely(nf_ct_key_equal(h, tuple, zone, net))) 797 return h; 798 799 /* TYPESAFE_BY_RCU recycled the candidate */ 800 nf_ct_put(ct); 801 } 802 803 h = NULL; 804 } 805 806 return h; 807 } 808 809 struct nf_conntrack_tuple_hash * 810 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 811 const struct nf_conntrack_tuple *tuple) 812 { 813 unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 814 struct nf_conntrack_tuple_hash *thash; 815 816 rcu_read_lock(); 817 818 thash = __nf_conntrack_find_get(net, zone, tuple, 819 hash_conntrack_raw(tuple, zone_id, net)); 820 821 if (thash) 822 goto out_unlock; 823 824 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 825 if (rid != zone_id) 826 thash = __nf_conntrack_find_get(net, zone, tuple, 827 hash_conntrack_raw(tuple, rid, net)); 828 829 out_unlock: 830 rcu_read_unlock(); 831 return thash; 832 } 833 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 834 835 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 836 unsigned int hash, 837 unsigned int reply_hash) 838 { 839 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 840 &nf_conntrack_hash[hash]); 841 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 842 &nf_conntrack_hash[reply_hash]); 843 } 844 845 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) 846 { 847 /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions 848 * may contain stale pointers to e.g. helper that has been removed. 849 * 850 * The helper can't clear this because the nf_conn object isn't in 851 * any hash and synchronize_rcu() isn't enough because associated skb 852 * might sit in a queue. 853 */ 854 return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); 855 } 856 857 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) 858 { 859 if (!ext) 860 return true; 861 862 if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) 863 return false; 864 865 /* inserted into conntrack table, nf_ct_iterate_cleanup() 866 * will find it. Disable nf_ct_ext_find() id check. 
867 */ 868 WRITE_ONCE(ext->gen_id, 0); 869 return true; 870 } 871 872 int 873 nf_conntrack_hash_check_insert(struct nf_conn *ct) 874 { 875 const struct nf_conntrack_zone *zone; 876 struct net *net = nf_ct_net(ct); 877 unsigned int hash, reply_hash; 878 struct nf_conntrack_tuple_hash *h; 879 struct hlist_nulls_node *n; 880 unsigned int max_chainlen; 881 unsigned int chainlen = 0; 882 unsigned int sequence; 883 int err = -EEXIST; 884 885 zone = nf_ct_zone(ct); 886 887 if (!nf_ct_ext_valid_pre(ct->ext)) { 888 NF_CT_STAT_INC_ATOMIC(net, insert_failed); 889 return -ETIMEDOUT; 890 } 891 892 local_bh_disable(); 893 do { 894 sequence = read_seqcount_begin(&nf_conntrack_generation); 895 hash = hash_conntrack(net, 896 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 897 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 898 reply_hash = hash_conntrack(net, 899 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 900 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 901 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 902 903 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); 904 905 /* See if there's one in the list already, including reverse */ 906 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 907 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 908 zone, net)) 909 goto out; 910 911 if (chainlen++ > max_chainlen) 912 goto chaintoolong; 913 } 914 915 chainlen = 0; 916 917 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 918 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 919 zone, net)) 920 goto out; 921 if (chainlen++ > max_chainlen) 922 goto chaintoolong; 923 } 924 925 smp_wmb(); 926 /* The caller holds a reference to this object */ 927 refcount_set(&ct->ct_general.use, 2); 928 __nf_conntrack_hash_insert(ct, hash, reply_hash); 929 nf_conntrack_double_unlock(hash, reply_hash); 930 NF_CT_STAT_INC(net, insert); 931 local_bh_enable(); 932 933 if (!nf_ct_ext_valid_post(ct->ext)) { 934 nf_ct_kill(ct); 935 NF_CT_STAT_INC_ATOMIC(net, drop); 936 return -ETIMEDOUT; 937 } 938 939 return 0; 940 chaintoolong: 941 NF_CT_STAT_INC(net, chaintoolong); 942 err = -ENOSPC; 943 out: 944 nf_conntrack_double_unlock(hash, reply_hash); 945 local_bh_enable(); 946 return err; 947 } 948 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 949 950 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, 951 unsigned int bytes) 952 { 953 struct nf_conn_acct *acct; 954 955 acct = nf_conn_acct_find(ct); 956 if (acct) { 957 struct nf_conn_counter *counter = acct->counter; 958 959 atomic64_add(packets, &counter[dir].packets); 960 atomic64_add(bytes, &counter[dir].bytes); 961 } 962 } 963 EXPORT_SYMBOL_GPL(nf_ct_acct_add); 964 965 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 966 const struct nf_conn *loser_ct) 967 { 968 struct nf_conn_acct *acct; 969 970 acct = nf_conn_acct_find(loser_ct); 971 if (acct) { 972 struct nf_conn_counter *counter = acct->counter; 973 unsigned int bytes; 974 975 /* u32 should be fine since we must have seen one packet. */ 976 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 977 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 978 } 979 } 980 981 static void __nf_conntrack_insert_prepare(struct nf_conn *ct) 982 { 983 struct nf_conn_tstamp *tstamp; 984 985 refcount_inc(&ct->ct_general.use); 986 987 /* set conntrack timestamp, if enabled. 
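	 * (the matching stop time is set by nf_ct_delete() when the entry dies)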
 */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply. Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table. This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted into the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped. In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry. The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.
Actual packet 1171 which created connection will be IP_CT_NEW or for an 1172 expected connection, IP_CT_RELATED. */ 1173 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 1174 return NF_ACCEPT; 1175 1176 zone = nf_ct_zone(ct); 1177 local_bh_disable(); 1178 1179 do { 1180 sequence = read_seqcount_begin(&nf_conntrack_generation); 1181 /* reuse the hash saved before */ 1182 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 1183 hash = scale_hash(hash); 1184 reply_hash = hash_conntrack(net, 1185 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1186 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 1187 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 1188 1189 /* We're not in hash table, and we refuse to set up related 1190 * connections for unconfirmed conns. But packet copies and 1191 * REJECT will give spurious warnings here. 1192 */ 1193 1194 /* Another skb with the same unconfirmed conntrack may 1195 * win the race. This may happen for bridge(br_flood) 1196 * or broadcast/multicast packets do skb_clone with 1197 * unconfirmed conntrack. 1198 */ 1199 if (unlikely(nf_ct_is_confirmed(ct))) { 1200 WARN_ON_ONCE(1); 1201 nf_conntrack_double_unlock(hash, reply_hash); 1202 local_bh_enable(); 1203 return NF_DROP; 1204 } 1205 1206 if (!nf_ct_ext_valid_pre(ct->ext)) { 1207 NF_CT_STAT_INC(net, insert_failed); 1208 goto dying; 1209 } 1210 1211 /* We have to check the DYING flag after unlink to prevent 1212 * a race against nf_ct_get_next_corpse() possibly called from 1213 * user context, else we insert an already 'dead' hash, blocking 1214 * further use of that particular connection -JM. 1215 */ 1216 ct->status |= IPS_CONFIRMED; 1217 1218 if (unlikely(nf_ct_is_dying(ct))) { 1219 NF_CT_STAT_INC(net, insert_failed); 1220 goto dying; 1221 } 1222 1223 max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN); 1224 /* See if there's one in the list already, including reverse: 1225 NAT could have grabbed it without realizing, since we're 1226 not in the hash. If there is, we lost race. */ 1227 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 1228 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1229 zone, net)) 1230 goto out; 1231 if (chainlen++ > max_chainlen) 1232 goto chaintoolong; 1233 } 1234 1235 chainlen = 0; 1236 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 1237 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1238 zone, net)) 1239 goto out; 1240 if (chainlen++ > max_chainlen) { 1241 chaintoolong: 1242 NF_CT_STAT_INC(net, chaintoolong); 1243 NF_CT_STAT_INC(net, insert_failed); 1244 ret = NF_DROP; 1245 goto dying; 1246 } 1247 } 1248 1249 /* Timer relative to confirmation time, not original 1250 setting time, otherwise we'd get timer wrap in 1251 weird delay cases. */ 1252 ct->timeout += nfct_time_stamp; 1253 1254 __nf_conntrack_insert_prepare(ct); 1255 1256 /* Since the lookup is lockless, hash insertion must be done after 1257 * starting the timer and setting the CONFIRMED bit. The RCU barriers 1258 * guarantee that no other CPU can find the conntrack before the above 1259 * stores are visible. 1260 */ 1261 __nf_conntrack_hash_insert(ct, hash, reply_hash); 1262 nf_conntrack_double_unlock(hash, reply_hash); 1263 local_bh_enable(); 1264 1265 /* ext area is still valid (rcu read lock is held, 1266 * but will go out of scope soon, we need to remove 1267 * this conntrack again. 
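	 * (i.e. nf_conntrack_ext_genid was bumped after the
	 * nf_ct_ext_valid_pre() check above, typically because a helper or
	 * other extension owner went away in the meantime)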
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection. Too bad: we're in trouble anyway.
*/ 1363 static unsigned int early_drop_list(struct net *net, 1364 struct hlist_nulls_head *head) 1365 { 1366 struct nf_conntrack_tuple_hash *h; 1367 struct hlist_nulls_node *n; 1368 unsigned int drops = 0; 1369 struct nf_conn *tmp; 1370 1371 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1372 tmp = nf_ct_tuplehash_to_ctrack(h); 1373 1374 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 1375 continue; 1376 1377 if (nf_ct_is_expired(tmp)) { 1378 nf_ct_gc_expired(tmp); 1379 continue; 1380 } 1381 1382 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1383 !net_eq(nf_ct_net(tmp), net) || 1384 nf_ct_is_dying(tmp)) 1385 continue; 1386 1387 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1388 continue; 1389 1390 /* load ->ct_net and ->status after refcount increase */ 1391 smp_acquire__after_ctrl_dep(); 1392 1393 /* kill only if still in same netns -- might have moved due to 1394 * SLAB_TYPESAFE_BY_RCU rules. 1395 * 1396 * We steal the timer reference. If that fails timer has 1397 * already fired or someone else deleted it. Just drop ref 1398 * and move to next entry. 1399 */ 1400 if (net_eq(nf_ct_net(tmp), net) && 1401 nf_ct_is_confirmed(tmp) && 1402 nf_ct_delete(tmp, 0, 0)) 1403 drops++; 1404 1405 nf_ct_put(tmp); 1406 } 1407 1408 return drops; 1409 } 1410 1411 static noinline int early_drop(struct net *net, unsigned int hash) 1412 { 1413 unsigned int i, bucket; 1414 1415 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1416 struct hlist_nulls_head *ct_hash; 1417 unsigned int hsize, drops; 1418 1419 rcu_read_lock(); 1420 nf_conntrack_get_ht(&ct_hash, &hsize); 1421 if (!i) 1422 bucket = reciprocal_scale(hash, hsize); 1423 else 1424 bucket = (bucket + 1) % hsize; 1425 1426 drops = early_drop_list(net, &ct_hash[bucket]); 1427 rcu_read_unlock(); 1428 1429 if (drops) { 1430 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1431 return true; 1432 } 1433 } 1434 1435 return false; 1436 } 1437 1438 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1439 { 1440 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1441 } 1442 1443 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1444 { 1445 const struct nf_conntrack_l4proto *l4proto; 1446 1447 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1448 return true; 1449 1450 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1451 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1452 return true; 1453 1454 return false; 1455 } 1456 1457 static void gc_worker(struct work_struct *work) 1458 { 1459 unsigned int i, hashsz, nf_conntrack_max95 = 0; 1460 u32 end_time, start_time = nfct_time_stamp; 1461 struct conntrack_gc_work *gc_work; 1462 unsigned int expired_count = 0; 1463 unsigned long next_run; 1464 s32 delta_time; 1465 long count; 1466 1467 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1468 1469 i = gc_work->next_bucket; 1470 if (gc_work->early_drop) 1471 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1472 1473 if (i == 0) { 1474 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; 1475 gc_work->count = GC_SCAN_INITIAL_COUNT; 1476 gc_work->start_time = start_time; 1477 } 1478 1479 next_run = gc_work->avg_timeout; 1480 count = gc_work->count; 1481 1482 end_time = start_time + GC_SCAN_MAX_DURATION; 1483 1484 do { 1485 struct nf_conntrack_tuple_hash *h; 1486 struct hlist_nulls_head *ct_hash; 1487 struct hlist_nulls_node *n; 1488 struct nf_conn *tmp; 1489 1490 rcu_read_lock(); 1491 1492 nf_conntrack_get_ht(&ct_hash, &hashsz); 1493 if (i >= hashsz) { 1494 rcu_read_unlock(); 1495 break; 1496 } 1497 1498 
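		/* Scan one bucket under RCU: reap entries that already expired,
		 * fold the remaining lifetimes into the running average used to
		 * pick the next scan interval, and consider early-dropping
		 * entries once the table is above ~95% of nf_conntrack_max.
		 */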
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1499 struct nf_conntrack_net *cnet; 1500 struct net *net; 1501 long expires; 1502 1503 tmp = nf_ct_tuplehash_to_ctrack(h); 1504 1505 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1506 nf_ct_offload_timeout(tmp); 1507 continue; 1508 } 1509 1510 if (expired_count > GC_SCAN_EXPIRED_MAX) { 1511 rcu_read_unlock(); 1512 1513 gc_work->next_bucket = i; 1514 gc_work->avg_timeout = next_run; 1515 gc_work->count = count; 1516 1517 delta_time = nfct_time_stamp - gc_work->start_time; 1518 1519 /* re-sched immediately if total cycle time is exceeded */ 1520 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; 1521 goto early_exit; 1522 } 1523 1524 if (nf_ct_is_expired(tmp)) { 1525 nf_ct_gc_expired(tmp); 1526 expired_count++; 1527 continue; 1528 } 1529 1530 expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); 1531 expires = (expires - (long)next_run) / ++count; 1532 next_run += expires; 1533 1534 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1535 continue; 1536 1537 net = nf_ct_net(tmp); 1538 cnet = nf_ct_pernet(net); 1539 if (atomic_read(&cnet->count) < nf_conntrack_max95) 1540 continue; 1541 1542 /* need to take reference to avoid possible races */ 1543 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1544 continue; 1545 1546 /* load ->status after refcount increase */ 1547 smp_acquire__after_ctrl_dep(); 1548 1549 if (gc_worker_skip_ct(tmp)) { 1550 nf_ct_put(tmp); 1551 continue; 1552 } 1553 1554 if (gc_worker_can_early_drop(tmp)) { 1555 nf_ct_kill(tmp); 1556 expired_count++; 1557 } 1558 1559 nf_ct_put(tmp); 1560 } 1561 1562 /* could check get_nulls_value() here and restart if ct 1563 * was moved to another chain. But given gc is best-effort 1564 * we will just continue with next hash slot. 
1565 */ 1566 rcu_read_unlock(); 1567 cond_resched(); 1568 i++; 1569 1570 delta_time = nfct_time_stamp - end_time; 1571 if (delta_time > 0 && i < hashsz) { 1572 gc_work->avg_timeout = next_run; 1573 gc_work->count = count; 1574 gc_work->next_bucket = i; 1575 next_run = 0; 1576 goto early_exit; 1577 } 1578 } while (i < hashsz); 1579 1580 gc_work->next_bucket = 0; 1581 1582 next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); 1583 1584 delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); 1585 if (next_run > (unsigned long)delta_time) 1586 next_run -= delta_time; 1587 else 1588 next_run = 1; 1589 1590 early_exit: 1591 if (gc_work->exiting) 1592 return; 1593 1594 if (next_run) 1595 gc_work->early_drop = false; 1596 1597 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1598 } 1599 1600 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1601 { 1602 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); 1603 gc_work->exiting = false; 1604 } 1605 1606 static struct nf_conn * 1607 __nf_conntrack_alloc(struct net *net, 1608 const struct nf_conntrack_zone *zone, 1609 const struct nf_conntrack_tuple *orig, 1610 const struct nf_conntrack_tuple *repl, 1611 gfp_t gfp, u32 hash) 1612 { 1613 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 1614 unsigned int ct_count; 1615 struct nf_conn *ct; 1616 1617 /* We don't want any race condition at early drop stage */ 1618 ct_count = atomic_inc_return(&cnet->count); 1619 1620 if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { 1621 if (!early_drop(net, hash)) { 1622 if (!conntrack_gc_work.early_drop) 1623 conntrack_gc_work.early_drop = true; 1624 atomic_dec(&cnet->count); 1625 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1626 return ERR_PTR(-ENOMEM); 1627 } 1628 } 1629 1630 /* 1631 * Do not use kmem_cache_zalloc(), as this cache uses 1632 * SLAB_TYPESAFE_BY_RCU. 1633 */ 1634 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1635 if (ct == NULL) 1636 goto out; 1637 1638 spin_lock_init(&ct->lock); 1639 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1640 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1641 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1642 /* save hash for reusing when confirming */ 1643 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1644 ct->status = 0; 1645 WRITE_ONCE(ct->timeout, 0); 1646 write_pnet(&ct->ct_net, net); 1647 memset_after(ct, 0, __nfct_init_offset); 1648 1649 nf_ct_zone_add(ct, zone); 1650 1651 /* Because we use RCU lookups, we set ct_general.use to zero before 1652 * this is inserted in any list. 
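	 * The refcount only becomes non-zero once the object is fully set up,
	 * see the refcount_set(..., 1) at the end of init_conntrack().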
1653 */ 1654 refcount_set(&ct->ct_general.use, 0); 1655 return ct; 1656 out: 1657 atomic_dec(&cnet->count); 1658 return ERR_PTR(-ENOMEM); 1659 } 1660 1661 struct nf_conn *nf_conntrack_alloc(struct net *net, 1662 const struct nf_conntrack_zone *zone, 1663 const struct nf_conntrack_tuple *orig, 1664 const struct nf_conntrack_tuple *repl, 1665 gfp_t gfp) 1666 { 1667 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1668 } 1669 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1670 1671 void nf_conntrack_free(struct nf_conn *ct) 1672 { 1673 struct net *net = nf_ct_net(ct); 1674 struct nf_conntrack_net *cnet; 1675 1676 /* A freed object has refcnt == 0, that's 1677 * the golden rule for SLAB_TYPESAFE_BY_RCU 1678 */ 1679 WARN_ON(refcount_read(&ct->ct_general.use) != 0); 1680 1681 if (ct->status & IPS_SRC_NAT_DONE) { 1682 const struct nf_nat_hook *nat_hook; 1683 1684 rcu_read_lock(); 1685 nat_hook = rcu_dereference(nf_nat_hook); 1686 if (nat_hook) 1687 nat_hook->remove_nat_bysrc(ct); 1688 rcu_read_unlock(); 1689 } 1690 1691 kfree(ct->ext); 1692 kmem_cache_free(nf_conntrack_cachep, ct); 1693 cnet = nf_ct_pernet(net); 1694 1695 smp_mb__before_atomic(); 1696 atomic_dec(&cnet->count); 1697 } 1698 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1699 1700 1701 /* Allocate a new conntrack: we return -ENOMEM if classification 1702 failed due to stress. Otherwise it really is unclassifiable. */ 1703 static noinline struct nf_conntrack_tuple_hash * 1704 init_conntrack(struct net *net, struct nf_conn *tmpl, 1705 const struct nf_conntrack_tuple *tuple, 1706 struct sk_buff *skb, 1707 unsigned int dataoff, u32 hash) 1708 { 1709 struct nf_conn *ct; 1710 struct nf_conn_help *help; 1711 struct nf_conntrack_tuple repl_tuple; 1712 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1713 struct nf_conntrack_ecache *ecache; 1714 #endif 1715 struct nf_conntrack_expect *exp = NULL; 1716 const struct nf_conntrack_zone *zone; 1717 struct nf_conn_timeout *timeout_ext; 1718 struct nf_conntrack_zone tmp; 1719 struct nf_conntrack_net *cnet; 1720 1721 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) 1722 return NULL; 1723 1724 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1725 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1726 hash); 1727 if (IS_ERR(ct)) 1728 return (struct nf_conntrack_tuple_hash *)ct; 1729 1730 if (!nf_ct_add_synproxy(ct, tmpl)) { 1731 nf_conntrack_free(ct); 1732 return ERR_PTR(-ENOMEM); 1733 } 1734 1735 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1736 1737 if (timeout_ext) 1738 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1739 GFP_ATOMIC); 1740 1741 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1742 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1743 nf_ct_labels_ext_add(ct); 1744 1745 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1746 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1747 1748 if ((ecache || net->ct.sysctl_events) && 1749 !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1750 ecache ? ecache->expmask : 0, 1751 GFP_ATOMIC)) { 1752 nf_conntrack_free(ct); 1753 return ERR_PTR(-ENOMEM); 1754 } 1755 #endif 1756 1757 cnet = nf_ct_pernet(net); 1758 if (cnet->expect_count) { 1759 spin_lock_bh(&nf_conntrack_expect_lock); 1760 exp = nf_ct_find_expectation(net, zone, tuple); 1761 if (exp) { 1762 /* Welcome, Mr. Bond. We've been expecting you... 
*/ 1763 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1764 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1765 ct->master = exp->master; 1766 if (exp->helper) { 1767 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1768 if (help) 1769 rcu_assign_pointer(help->helper, exp->helper); 1770 } 1771 1772 #ifdef CONFIG_NF_CONNTRACK_MARK 1773 ct->mark = READ_ONCE(exp->master->mark); 1774 #endif 1775 #ifdef CONFIG_NF_CONNTRACK_SECMARK 1776 ct->secmark = exp->master->secmark; 1777 #endif 1778 NF_CT_STAT_INC(net, expect_new); 1779 } 1780 spin_unlock_bh(&nf_conntrack_expect_lock); 1781 } 1782 if (!exp && tmpl) 1783 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1784 1785 /* Other CPU might have obtained a pointer to this object before it was 1786 * released. Because refcount is 0, refcount_inc_not_zero() will fail. 1787 * 1788 * After refcount_set(1) it will succeed; ensure that zeroing of 1789 * ct->status and the correct ct->net pointer are visible; else other 1790 * core might observe CONFIRMED bit which means the entry is valid and 1791 * in the hash table, but its not (anymore). 1792 */ 1793 smp_wmb(); 1794 1795 /* Now it is going to be associated with an sk_buff, set refcount to 1. */ 1796 refcount_set(&ct->ct_general.use, 1); 1797 1798 if (exp) { 1799 if (exp->expectfn) 1800 exp->expectfn(ct, exp); 1801 nf_ct_expect_put(exp); 1802 } 1803 1804 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1805 } 1806 1807 /* On success, returns 0, sets skb->_nfct | ctinfo */ 1808 static int 1809 resolve_normal_ct(struct nf_conn *tmpl, 1810 struct sk_buff *skb, 1811 unsigned int dataoff, 1812 u_int8_t protonum, 1813 const struct nf_hook_state *state) 1814 { 1815 const struct nf_conntrack_zone *zone; 1816 struct nf_conntrack_tuple tuple; 1817 struct nf_conntrack_tuple_hash *h; 1818 enum ip_conntrack_info ctinfo; 1819 struct nf_conntrack_zone tmp; 1820 u32 hash, zone_id, rid; 1821 struct nf_conn *ct; 1822 1823 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1824 dataoff, state->pf, protonum, state->net, 1825 &tuple)) 1826 return 0; 1827 1828 /* look for tuple match */ 1829 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1830 1831 zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 1832 hash = hash_conntrack_raw(&tuple, zone_id, state->net); 1833 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1834 1835 if (!h) { 1836 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 1837 if (zone_id != rid) { 1838 u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); 1839 1840 h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); 1841 } 1842 } 1843 1844 if (!h) { 1845 h = init_conntrack(state->net, tmpl, &tuple, 1846 skb, dataoff, hash); 1847 if (!h) 1848 return 0; 1849 if (IS_ERR(h)) 1850 return PTR_ERR(h); 1851 } 1852 ct = nf_ct_tuplehash_to_ctrack(h); 1853 1854 /* It exists; we have (non-exclusive) reference. */ 1855 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1856 ctinfo = IP_CT_ESTABLISHED_REPLY; 1857 } else { 1858 unsigned long status = READ_ONCE(ct->status); 1859 1860 /* Once we've had two way comms, always ESTABLISHED. */ 1861 if (likely(status & IPS_SEEN_REPLY)) 1862 ctinfo = IP_CT_ESTABLISHED; 1863 else if (status & IPS_EXPECTED) 1864 ctinfo = IP_CT_RELATED; 1865 else 1866 ctinfo = IP_CT_NEW; 1867 } 1868 nf_ct_set(skb, ct, ctinfo); 1869 return 0; 1870 } 1871 1872 /* 1873 * icmp packets need special treatment to handle error messages that are 1874 * related to a connection. 
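 * An ICMP error embeds the header of the packet that triggered it, which
 * is what lets the tracker map the error onto the offending flow.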
1875 * 1876 * Callers need to check if skb has a conntrack assigned when this 1877 * helper returns; in such case skb belongs to an already known connection. 1878 */ 1879 static unsigned int __cold 1880 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1881 struct sk_buff *skb, 1882 unsigned int dataoff, 1883 u8 protonum, 1884 const struct nf_hook_state *state) 1885 { 1886 int ret; 1887 1888 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1889 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1890 #if IS_ENABLED(CONFIG_IPV6) 1891 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1892 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1893 #endif 1894 else 1895 return NF_ACCEPT; 1896 1897 if (ret <= 0) 1898 NF_CT_STAT_INC_ATOMIC(state->net, error); 1899 1900 return ret; 1901 } 1902 1903 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1904 enum ip_conntrack_info ctinfo) 1905 { 1906 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1907 1908 if (!timeout) 1909 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1910 1911 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1912 return NF_ACCEPT; 1913 } 1914 1915 /* Returns verdict for packet, or -1 for invalid. */ 1916 static int nf_conntrack_handle_packet(struct nf_conn *ct, 1917 struct sk_buff *skb, 1918 unsigned int dataoff, 1919 enum ip_conntrack_info ctinfo, 1920 const struct nf_hook_state *state) 1921 { 1922 switch (nf_ct_protonum(ct)) { 1923 case IPPROTO_TCP: 1924 return nf_conntrack_tcp_packet(ct, skb, dataoff, 1925 ctinfo, state); 1926 case IPPROTO_UDP: 1927 return nf_conntrack_udp_packet(ct, skb, dataoff, 1928 ctinfo, state); 1929 case IPPROTO_ICMP: 1930 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); 1931 #if IS_ENABLED(CONFIG_IPV6) 1932 case IPPROTO_ICMPV6: 1933 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); 1934 #endif 1935 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 1936 case IPPROTO_UDPLITE: 1937 return nf_conntrack_udplite_packet(ct, skb, dataoff, 1938 ctinfo, state); 1939 #endif 1940 #ifdef CONFIG_NF_CT_PROTO_SCTP 1941 case IPPROTO_SCTP: 1942 return nf_conntrack_sctp_packet(ct, skb, dataoff, 1943 ctinfo, state); 1944 #endif 1945 #ifdef CONFIG_NF_CT_PROTO_DCCP 1946 case IPPROTO_DCCP: 1947 return nf_conntrack_dccp_packet(ct, skb, dataoff, 1948 ctinfo, state); 1949 #endif 1950 #ifdef CONFIG_NF_CT_PROTO_GRE 1951 case IPPROTO_GRE: 1952 return nf_conntrack_gre_packet(ct, skb, dataoff, 1953 ctinfo, state); 1954 #endif 1955 } 1956 1957 return generic_packet(ct, skb, ctinfo); 1958 } 1959 1960 unsigned int 1961 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 1962 { 1963 enum ip_conntrack_info ctinfo; 1964 struct nf_conn *ct, *tmpl; 1965 u_int8_t protonum; 1966 int dataoff, ret; 1967 1968 tmpl = nf_ct_get(skb, &ctinfo); 1969 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1970 /* Previously seen (loopback or untracked)? Ignore. 
*/ 1971 if ((tmpl && !nf_ct_is_template(tmpl)) || 1972 ctinfo == IP_CT_UNTRACKED) 1973 return NF_ACCEPT; 1974 skb->_nfct = 0; 1975 } 1976 1977 /* rcu_read_lock()ed by nf_hook_thresh */ 1978 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1979 if (dataoff <= 0) { 1980 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1981 ret = NF_ACCEPT; 1982 goto out; 1983 } 1984 1985 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1986 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1987 protonum, state); 1988 if (ret <= 0) { 1989 ret = -ret; 1990 goto out; 1991 } 1992 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1993 if (skb->_nfct) 1994 goto out; 1995 } 1996 repeat: 1997 ret = resolve_normal_ct(tmpl, skb, dataoff, 1998 protonum, state); 1999 if (ret < 0) { 2000 /* Too stressed to deal. */ 2001 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2002 ret = NF_DROP; 2003 goto out; 2004 } 2005 2006 ct = nf_ct_get(skb, &ctinfo); 2007 if (!ct) { 2008 /* Not valid part of a connection */ 2009 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2010 ret = NF_ACCEPT; 2011 goto out; 2012 } 2013 2014 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2015 if (ret <= 0) { 2016 /* Invalid: inverse of the return code tells 2017 * the netfilter core what to do */ 2018 nf_ct_put(ct); 2019 skb->_nfct = 0; 2020 /* Special case: TCP tracker reports an attempt to reopen a 2021 * closed/aborted connection. We have to go back and create a 2022 * fresh conntrack. 2023 */ 2024 if (ret == -NF_REPEAT) 2025 goto repeat; 2026 2027 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2028 if (ret == -NF_DROP) 2029 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2030 2031 ret = -ret; 2032 goto out; 2033 } 2034 2035 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2036 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2037 nf_conntrack_event_cache(IPCT_REPLY, ct); 2038 out: 2039 if (tmpl) 2040 nf_ct_put(tmpl); 2041 2042 return ret; 2043 } 2044 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2045 2046 /* Alter reply tuple (maybe alter helper). 
This is for NAT, and is 2047 implicitly racy: see __nf_conntrack_confirm */ 2048 void nf_conntrack_alter_reply(struct nf_conn *ct, 2049 const struct nf_conntrack_tuple *newreply) 2050 { 2051 struct nf_conn_help *help = nfct_help(ct); 2052 2053 /* Should be unconfirmed, so not in hash table yet */ 2054 WARN_ON(nf_ct_is_confirmed(ct)); 2055 2056 nf_ct_dump_tuple(newreply); 2057 2058 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 2059 if (ct->master || (help && !hlist_empty(&help->expectations))) 2060 return; 2061 } 2062 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 2063 2064 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2065 void __nf_ct_refresh_acct(struct nf_conn *ct, 2066 enum ip_conntrack_info ctinfo, 2067 const struct sk_buff *skb, 2068 u32 extra_jiffies, 2069 bool do_acct) 2070 { 2071 /* Only update if this is not a fixed timeout */ 2072 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2073 goto acct; 2074 2075 /* If not in hash table, timer will not be active yet */ 2076 if (nf_ct_is_confirmed(ct)) 2077 extra_jiffies += nfct_time_stamp; 2078 2079 if (READ_ONCE(ct->timeout) != extra_jiffies) 2080 WRITE_ONCE(ct->timeout, extra_jiffies); 2081 acct: 2082 if (do_acct) 2083 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2084 } 2085 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2086 2087 bool nf_ct_kill_acct(struct nf_conn *ct, 2088 enum ip_conntrack_info ctinfo, 2089 const struct sk_buff *skb) 2090 { 2091 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2092 2093 return nf_ct_delete(ct, 0, 0); 2094 } 2095 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2096 2097 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2098 2099 #include <linux/netfilter/nfnetlink.h> 2100 #include <linux/netfilter/nfnetlink_conntrack.h> 2101 #include <linux/mutex.h> 2102 2103 /* Generic function for tcp/udp/sctp/dccp and alike. */ 2104 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2105 const struct nf_conntrack_tuple *tuple) 2106 { 2107 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2108 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2109 goto nla_put_failure; 2110 return 0; 2111 2112 nla_put_failure: 2113 return -1; 2114 } 2115 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2116 2117 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2118 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2119 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2120 }; 2121 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2122 2123 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2124 struct nf_conntrack_tuple *t, 2125 u_int32_t flags) 2126 { 2127 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2128 if (!tb[CTA_PROTO_SRC_PORT]) 2129 return -EINVAL; 2130 2131 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2132 } 2133 2134 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2135 if (!tb[CTA_PROTO_DST_PORT]) 2136 return -EINVAL; 2137 2138 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2139 } 2140 2141 return 0; 2142 } 2143 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2144 2145 unsigned int nf_ct_port_nlattr_tuple_size(void) 2146 { 2147 static unsigned int size __read_mostly; 2148 2149 if (!size) 2150 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2151 2152 return size; 2153 } 2154 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2155 #endif 2156 2157 /* Used by ipt_REJECT and ip6t_REJECT. 
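 * Those targets build a brand new skb (TCP reset or ICMP error) and use this
 * hook to copy the originating packet's conntrack reference onto it; the
 * generated packet travels in the opposite direction to the one that caused
 * it, hence the RELATED/RELATED_REPLY ctinfo below.  Rough calling pattern
 * (illustrative; the real callers go through the nf_ct_attach() wrapper,
 * with nskb the freshly built reply and oldskb the offending packet):
 *
 *	nf_ct_attach(nskb, oldskb);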
*/ 2158 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2159 { 2160 struct nf_conn *ct; 2161 enum ip_conntrack_info ctinfo; 2162 2163 /* This ICMP is in reverse direction to the packet which caused it */ 2164 ct = nf_ct_get(skb, &ctinfo); 2165 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2166 ctinfo = IP_CT_RELATED_REPLY; 2167 else 2168 ctinfo = IP_CT_RELATED; 2169 2170 /* Attach to new skbuff, and increment count */ 2171 nf_ct_set(nskb, ct, ctinfo); 2172 nf_conntrack_get(skb_nfct(nskb)); 2173 } 2174 2175 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2176 struct nf_conn *ct, 2177 enum ip_conntrack_info ctinfo) 2178 { 2179 const struct nf_nat_hook *nat_hook; 2180 struct nf_conntrack_tuple_hash *h; 2181 struct nf_conntrack_tuple tuple; 2182 unsigned int status; 2183 int dataoff; 2184 u16 l3num; 2185 u8 l4num; 2186 2187 l3num = nf_ct_l3num(ct); 2188 2189 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2190 if (dataoff <= 0) 2191 return -1; 2192 2193 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2194 l4num, net, &tuple)) 2195 return -1; 2196 2197 if (ct->status & IPS_SRC_NAT) { 2198 memcpy(tuple.src.u3.all, 2199 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2200 sizeof(tuple.src.u3.all)); 2201 tuple.src.u.all = 2202 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2203 } 2204 2205 if (ct->status & IPS_DST_NAT) { 2206 memcpy(tuple.dst.u3.all, 2207 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2208 sizeof(tuple.dst.u3.all)); 2209 tuple.dst.u.all = 2210 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2211 } 2212 2213 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2214 if (!h) 2215 return 0; 2216 2217 /* Store status bits of the conntrack that is clashing to re-do NAT 2218 * mangling according to what it has been done already to this packet. 2219 */ 2220 status = ct->status; 2221 2222 nf_ct_put(ct); 2223 ct = nf_ct_tuplehash_to_ctrack(h); 2224 nf_ct_set(skb, ct, ctinfo); 2225 2226 nat_hook = rcu_dereference(nf_nat_hook); 2227 if (!nat_hook) 2228 return 0; 2229 2230 if (status & IPS_SRC_NAT && 2231 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2232 IP_CT_DIR_ORIGINAL) == NF_DROP) 2233 return -1; 2234 2235 if (status & IPS_DST_NAT && 2236 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2237 IP_CT_DIR_ORIGINAL) == NF_DROP) 2238 return -1; 2239 2240 return 0; 2241 } 2242 2243 /* This packet is coming from userspace via nf_queue, complete the packet 2244 * processing after the helper invocation in nf_confirm(). 
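 * For connections handled by a userspace helper this means redoing the TCP
 * sequence number adjustment (if any) and then confirming the entry, i.e.
 * inserting it into the hash table, before the packet continues.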
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return 0;

	/* The helper can be unregistered concurrently; bail out if it is gone. */
	helper = rcu_dereference(help->helper);
	if (!helper)
		return 0;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return 0;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return 0;
		break;
	}
#endif
	default:
		return 0;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return -1;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
}

static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

	if (!nf_ct_is_confirmed(ct)) {
		err = __nf_conntrack_update(net, skb, ct, ctinfo);
		if (err < 0)
			return err;

		ct = nf_ct_get(skb, &ctinfo);
	}

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead!
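 *
 * Walk the hash table bucket by bucket under the per-bucket lock and hand
 * every conntrack (via its reply-direction entry, so each one is visited
 * once) to the iterator; the first entry the callback flags is returned
 * with an extra reference held so the caller can delete it.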
 */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to the hash table twice,
			 * once for the original direction tuple, once for the
			 * reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * main table).
 *
 * Can only be called in module exit path.
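 *
 * A minimal iterator sketch (illustrative only; it simply matches every
 * entry, like the kill_all() helper further down):
 *
 *	static int drop_everything(struct nf_conn *ct, void *data)
 *	{
 *		return 1;	// nonzero means "delete this conntrack"
 *	}
 *
 *	nf_ct_iterate_destroy(drop_everything, NULL);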
2449 */ 2450 void 2451 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2452 { 2453 struct nf_ct_iter_data iter_data = {}; 2454 struct net *net; 2455 2456 down_read(&net_rwsem); 2457 for_each_net(net) { 2458 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2459 2460 if (atomic_read(&cnet->count) == 0) 2461 continue; 2462 nf_queue_nf_hook_drop(net); 2463 } 2464 up_read(&net_rwsem); 2465 2466 /* Need to wait for netns cleanup worker to finish, if its 2467 * running -- it might have deleted a net namespace from 2468 * the global list, so hook drop above might not have 2469 * affected all namespaces. 2470 */ 2471 net_ns_barrier(); 2472 2473 /* a skb w. unconfirmed conntrack could have been reinjected just 2474 * before we called nf_queue_nf_hook_drop(). 2475 * 2476 * This makes sure its inserted into conntrack table. 2477 */ 2478 synchronize_net(); 2479 2480 nf_ct_ext_bump_genid(); 2481 iter_data.data = data; 2482 nf_ct_iterate_cleanup(iter, &iter_data); 2483 2484 /* Another cpu might be in a rcu read section with 2485 * rcu protected pointer cleared in iter callback 2486 * or hidden via nf_ct_ext_bump_genid() above. 2487 * 2488 * Wait until those are done. 2489 */ 2490 synchronize_rcu(); 2491 } 2492 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2493 2494 static int kill_all(struct nf_conn *i, void *data) 2495 { 2496 return 1; 2497 } 2498 2499 void nf_conntrack_cleanup_start(void) 2500 { 2501 cleanup_nf_conntrack_bpf(); 2502 conntrack_gc_work.exiting = true; 2503 } 2504 2505 void nf_conntrack_cleanup_end(void) 2506 { 2507 RCU_INIT_POINTER(nf_ct_hook, NULL); 2508 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2509 kvfree(nf_conntrack_hash); 2510 2511 nf_conntrack_proto_fini(); 2512 nf_conntrack_helper_fini(); 2513 nf_conntrack_expect_fini(); 2514 2515 kmem_cache_destroy(nf_conntrack_cachep); 2516 } 2517 2518 /* 2519 * Mishearing the voices in his head, our hero wonders how he's 2520 * supposed to kill the mall. 2521 */ 2522 void nf_conntrack_cleanup_net(struct net *net) 2523 { 2524 LIST_HEAD(single); 2525 2526 list_add(&net->exit_list, &single); 2527 nf_conntrack_cleanup_net_list(&single); 2528 } 2529 2530 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2531 { 2532 struct nf_ct_iter_data iter_data = {}; 2533 struct net *net; 2534 int busy; 2535 2536 /* 2537 * This makes sure all current packets have passed through 2538 * netfilter framework. Roll on, two-stage module 2539 * delete... 
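	 * (synchronize_net() below waits for a full RCU grace period, so
	 * nothing can still be looking at conntrack state of the exiting
	 * namespaces once we start killing their entries.)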
2540 */ 2541 synchronize_net(); 2542 i_see_dead_people: 2543 busy = 0; 2544 list_for_each_entry(net, net_exit_list, exit_list) { 2545 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2546 2547 iter_data.net = net; 2548 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2549 if (atomic_read(&cnet->count) != 0) 2550 busy = 1; 2551 } 2552 if (busy) { 2553 schedule(); 2554 goto i_see_dead_people; 2555 } 2556 2557 list_for_each_entry(net, net_exit_list, exit_list) { 2558 nf_conntrack_ecache_pernet_fini(net); 2559 nf_conntrack_expect_pernet_fini(net); 2560 free_percpu(net->ct.stat); 2561 } 2562 } 2563 2564 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2565 { 2566 struct hlist_nulls_head *hash; 2567 unsigned int nr_slots, i; 2568 2569 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2570 return NULL; 2571 2572 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2573 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2574 2575 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2576 2577 if (hash && nulls) 2578 for (i = 0; i < nr_slots; i++) 2579 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2580 2581 return hash; 2582 } 2583 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2584 2585 int nf_conntrack_hash_resize(unsigned int hashsize) 2586 { 2587 int i, bucket; 2588 unsigned int old_size; 2589 struct hlist_nulls_head *hash, *old_hash; 2590 struct nf_conntrack_tuple_hash *h; 2591 struct nf_conn *ct; 2592 2593 if (!hashsize) 2594 return -EINVAL; 2595 2596 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2597 if (!hash) 2598 return -ENOMEM; 2599 2600 mutex_lock(&nf_conntrack_mutex); 2601 old_size = nf_conntrack_htable_size; 2602 if (old_size == hashsize) { 2603 mutex_unlock(&nf_conntrack_mutex); 2604 kvfree(hash); 2605 return 0; 2606 } 2607 2608 local_bh_disable(); 2609 nf_conntrack_all_lock(); 2610 write_seqcount_begin(&nf_conntrack_generation); 2611 2612 /* Lookups in the old hash might happen in parallel, which means we 2613 * might get false negatives during connection lookup. New connections 2614 * created because of a false negative won't make it into the hash 2615 * though since that required taking the locks. 2616 */ 2617 2618 for (i = 0; i < nf_conntrack_htable_size; i++) { 2619 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2620 unsigned int zone_id; 2621 2622 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2623 struct nf_conntrack_tuple_hash, hnnode); 2624 ct = nf_ct_tuplehash_to_ctrack(h); 2625 hlist_nulls_del_rcu(&h->hnnode); 2626 2627 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2628 bucket = __hash_conntrack(nf_ct_net(ct), 2629 &h->tuple, zone_id, hashsize); 2630 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2631 } 2632 } 2633 old_hash = nf_conntrack_hash; 2634 2635 nf_conntrack_hash = hash; 2636 nf_conntrack_htable_size = hashsize; 2637 2638 write_seqcount_end(&nf_conntrack_generation); 2639 nf_conntrack_all_unlock(); 2640 local_bh_enable(); 2641 2642 mutex_unlock(&nf_conntrack_mutex); 2643 2644 synchronize_net(); 2645 kvfree(old_hash); 2646 return 0; 2647 } 2648 2649 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2650 { 2651 unsigned int hashsize; 2652 int rc; 2653 2654 if (current->nsproxy->net_ns != &init_net) 2655 return -EOPNOTSUPP; 2656 2657 /* On boot, we can set this without any fancy locking. 
*/ 2658 if (!nf_conntrack_hash) 2659 return param_set_uint(val, kp); 2660 2661 rc = kstrtouint(val, 0, &hashsize); 2662 if (rc) 2663 return rc; 2664 2665 return nf_conntrack_hash_resize(hashsize); 2666 } 2667 2668 int nf_conntrack_init_start(void) 2669 { 2670 unsigned long nr_pages = totalram_pages(); 2671 int max_factor = 8; 2672 int ret = -ENOMEM; 2673 int i; 2674 2675 seqcount_spinlock_init(&nf_conntrack_generation, 2676 &nf_conntrack_locks_all_lock); 2677 2678 for (i = 0; i < CONNTRACK_LOCKS; i++) 2679 spin_lock_init(&nf_conntrack_locks[i]); 2680 2681 if (!nf_conntrack_htable_size) { 2682 nf_conntrack_htable_size 2683 = (((nr_pages << PAGE_SHIFT) / 16384) 2684 / sizeof(struct hlist_head)); 2685 if (BITS_PER_LONG >= 64 && 2686 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2687 nf_conntrack_htable_size = 262144; 2688 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2689 nf_conntrack_htable_size = 65536; 2690 2691 if (nf_conntrack_htable_size < 1024) 2692 nf_conntrack_htable_size = 1024; 2693 /* Use a max. factor of one by default to keep the average 2694 * hash chain length at 2 entries. Each entry has to be added 2695 * twice (once for original direction, once for reply). 2696 * When a table size is given we use the old value of 8 to 2697 * avoid implicit reduction of the max entries setting. 2698 */ 2699 max_factor = 1; 2700 } 2701 2702 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2703 if (!nf_conntrack_hash) 2704 return -ENOMEM; 2705 2706 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2707 2708 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2709 sizeof(struct nf_conn), 2710 NFCT_INFOMASK + 1, 2711 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2712 if (!nf_conntrack_cachep) 2713 goto err_cachep; 2714 2715 ret = nf_conntrack_expect_init(); 2716 if (ret < 0) 2717 goto err_expect; 2718 2719 ret = nf_conntrack_helper_init(); 2720 if (ret < 0) 2721 goto err_helper; 2722 2723 ret = nf_conntrack_proto_init(); 2724 if (ret < 0) 2725 goto err_proto; 2726 2727 conntrack_gc_work_init(&conntrack_gc_work); 2728 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2729 2730 ret = register_nf_conntrack_bpf(); 2731 if (ret < 0) 2732 goto err_kfunc; 2733 2734 return 0; 2735 2736 err_kfunc: 2737 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2738 nf_conntrack_proto_fini(); 2739 err_proto: 2740 nf_conntrack_helper_fini(); 2741 err_helper: 2742 nf_conntrack_expect_fini(); 2743 err_expect: 2744 kmem_cache_destroy(nf_conntrack_cachep); 2745 err_cachep: 2746 kvfree(nf_conntrack_hash); 2747 return ret; 2748 } 2749 2750 static const struct nf_ct_hook nf_conntrack_hook = { 2751 .update = nf_conntrack_update, 2752 .destroy = nf_ct_destroy, 2753 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2754 .attach = nf_conntrack_attach, 2755 }; 2756 2757 void nf_conntrack_init_end(void) 2758 { 2759 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2760 } 2761 2762 /* 2763 * We need to use special "null" values, not used in hash table 2764 */ 2765 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2766 2767 int nf_conntrack_init_net(struct net *net) 2768 { 2769 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2770 int ret = -ENOMEM; 2771 2772 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2773 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2774 atomic_set(&cnet->count, 0); 2775 2776 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2777 if (!net->ct.stat) 2778 return ret; 2779 2780 ret = nf_conntrack_expect_pernet_init(net); 2781 if (ret < 
0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
	return ret;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		return -EPERM;

	__nf_ct_set_timeout(ct, timeout);

	if (test_bit(IPS_DYING_BIT, &ct->status))
		return -ETIME;

	return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
	unsigned int bit;

	/* Ignore these unchangeable bits */
	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
	unsigned long d;

	d = ct->status ^ status;

	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
		/* unchangeable */
		return -EBUSY;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	__nf_ct_change_status(ct, status, 0);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);
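
/* Sketch of how the helpers above are meant to be used by ctnetlink-style
 * code (illustrative only; the real callers live in ctnetlink and the
 * conntrack BPF kfuncs).  Timeouts are relative and given in jiffies,
 * status changes are filtered against IPS_UNCHANGEABLE_MASK:
 *
 *	err = __nf_ct_change_timeout(ct, 120 * HZ);	// ~2 minutes
 *	if (!err)
 *		__nf_ct_change_status(ct, IPS_ASSURED, 0);	// set ASSURED
 */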