// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter. This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			count;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* Initial bias pretending we have 100 entries at the upper bound so we don't
 * wake up often just because we have three entries with a 1s timeout while still
 * allowing non-idle machines to wake up more often when needed.
 */
#define GC_SCAN_INITIAL_COUNT	100
#define GC_SCAN_INTERVAL_INIT	GC_SCAN_INTERVAL_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	50u
#define MAX_CHAINLEN	(80u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;

void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
	 * It pairs with the smp_load_acquire() in nf_conntrack_lock()
	 */
	smp_store_release(&nf_conntrack_locks_all, false);
	spin_unlock(&nf_conntrack_locks_all_lock);
}

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static siphash_aligned_key_t nf_conntrack_hash_rnd;

static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
			      unsigned int zoneid,
			      const struct net *net)
{
	siphash_key_t key;

	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));

	key = nf_conntrack_hash_rnd;

	key.key[0] ^= zoneid;
	key.key[1] ^= net_hash_mix(net);

	return siphash((void *)tuple,
		       offsetofend(struct nf_conntrack_tuple, dst.__nfct_hash_offsetend),
		       &key);
}

static u32 scale_hash(u32 hash)
{
	return reciprocal_scale(hash, nf_conntrack_htable_size);
}

static u32 __hash_conntrack(const struct net *net,
			    const struct nf_conntrack_tuple *tuple,
			    unsigned int zoneid,
			    unsigned int size)
{
	return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size);
}

static u32 hash_conntrack(const struct net *net,
			  const struct nf_conntrack_tuple *tuple,
			  unsigned int zoneid)
{
	return scale_hash(hash_conntrack_raw(tuple, zoneid, net));
}

static bool nf_ct_get_tuple_ports(const struct sk_buff *skb,
				  unsigned int dataoff,
				  struct nf_conntrack_tuple *tuple)
{
	struct {
		__be16 sport;
		__be16 dport;
	} _inet_hdr, *inet_hdr;

	/* Actually only need first 4 bytes to get ports. */
	inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
	if (!inet_hdr)
		return false;

	tuple->src.u.udp.port = inet_hdr->sport;
	tuple->dst.u.udp.port = inet_hdr->dport;
	return true;
}

static bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct net *net,
		struct nf_conntrack_tuple *tuple)
{
	unsigned int size;
	const __be32 *ap;
	__be32 _addrs[8];

	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	switch (l3num) {
	case NFPROTO_IPV4:
		nhoff += offsetof(struct iphdr, saddr);
		size = 2 * sizeof(__be32);
		break;
	case NFPROTO_IPV6:
		nhoff += offsetof(struct ipv6hdr, saddr);
		size = sizeof(_addrs);
		break;
	default:
		return true;
	}

	ap = skb_header_pointer(skb, nhoff, size, _addrs);
	if (!ap)
		return false;

	switch (l3num) {
	case NFPROTO_IPV4:
		tuple->src.u3.ip = ap[0];
		tuple->dst.u3.ip = ap[1];
		break;
	case NFPROTO_IPV6:
		memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
		memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
		break;
	}

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	switch (protonum) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_ICMP:
		return icmp_pkt_to_tuple(skb, dataoff, net, tuple);
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return gre_pkt_to_tuple(skb, dataoff, net, tuple);
#endif
	case IPPROTO_TCP:
	case IPPROTO_UDP:
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
#endif
		/* fallthrough */
		return nf_ct_get_tuple_ports(skb, dataoff, tuple);
	default:
		break;
	}

	return true;
}

static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u_int8_t *protonum)
{
	int dataoff = -1;
	const struct iphdr *iph;
	struct iphdr _iph;

	iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
	if (!iph)
		return -1;

	/* Conntrack defragments packets; we might still see fragments
	 * inside ICMP packets though.
	 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
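
/* Example (illustration only, with made-up addresses): for a TCP tuple
 * 10.0.0.1:1234 -> 10.0.0.2:80, nf_ct_invert_tuple() yields
 * 10.0.0.2:80 -> 10.0.0.1:1234 with dst.dir flipped to IP_CT_DIR_REPLY.
 * ICMP/ICMPv6 go through the protocol-specific helpers above because their
 * flows are identified by id/type/code rather than by ports.
 */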

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * Intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4. the original direction tuple
 */
u32 nf_ct_get_id(const struct nf_conn *ct)
{
	static siphash_aligned_key_t ct_id_seed;
	unsigned long a, b, c, d;

	net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));

	a = (unsigned long)ct;
	b = (unsigned long)ct->master;
	c = (unsigned long)nf_ct_net(ct);
	d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				   sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple),
				   &ct_id_seed);
#ifdef CONFIG_64BIT
	return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
	return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);

static void
clean_from_lists(struct nf_conn *ct)
{
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

#define NFCT_ALIGN(len)	(((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)

/* Released via nf_ct_destroy() */
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
				 const struct nf_conntrack_zone *zone,
				 gfp_t flags)
{
	struct nf_conn *tmpl, *p;

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) {
		tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags);
		if (!tmpl)
			return NULL;

		p = tmpl;
		tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
		if (tmpl != p) {
			tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p);
			tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p;
		}
	} else {
		tmpl = kzalloc(sizeof(*tmpl), flags);
		if (!tmpl)
			return NULL;
	}

	tmpl->status = IPS_TEMPLATE;
	write_pnet(&tmpl->ct_net, net);
	nf_ct_zone_add(tmpl, zone);
	refcount_set(&tmpl->ct_general.use, 1);

	return tmpl;
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);

void nf_ct_tmpl_free(struct nf_conn *tmpl)
{
	kfree(tmpl->ext);

	if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK)
		kfree((char *)tmpl - tmpl->proto.tmpl_padto);
	else
		kfree(tmpl);
}
EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);

static void destroy_gre_conntrack(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CT_PROTO_GRE
	struct nf_conn *master = ct->master;

	if (master)
		nf_ct_gre_keymap_destroy(master);
#endif
}

void nf_ct_destroy(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;

	WARN_ON(refcount_read(&nfct->use) != 0);

	if (unlikely(nf_ct_is_template(ct))) {
		nf_ct_tmpl_free(ct);
		return;
	}

	if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
		destroy_gre_conntrack(ct);

	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before the connection is in the list, so we need to clean here,
	 * too.
	 */
	nf_ct_remove_expectations(ct);

	if (ct->master)
		nf_ct_put(ct->master);

	nf_conntrack_free(ct);
}
EXPORT_SYMBOL(nf_ct_destroy);

static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	unsigned int sequence;

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	clean_from_lists(ct);
	nf_conntrack_double_unlock(hash, reply_hash);
}

static void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	nf_ct_helper_destroy(ct);
	local_bh_disable();

	__nf_ct_delete_from_lists(ct);

	local_bh_enable();
}

static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));

	spin_lock(&cnet->ecache.dying_lock);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &cnet->ecache.dying_list);
	spin_unlock(&cnet->ecache.dying_lock);
#endif
}

bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
	struct nf_conn_tstamp *tstamp;
	struct net *net;

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp) {
		s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp;

		tstamp->stop = ktime_get_real_ns();
		if (timeout < 0)
			tstamp->stop -= jiffies_to_nsecs(-timeout);
	}

	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
				      portid, report) < 0) {
		/* destroy event was not delivered. nf_ct_put will
		 * be done by event cache worker on redelivery.
		 */
		nf_ct_helper_destroy(ct);
		local_bh_disable();
		__nf_ct_delete_from_lists(ct);
		nf_ct_add_to_ecache_list(ct);
		local_bh_enable();

		nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
		return false;
	}

	net = nf_ct_net(ct);
	if (nf_conntrack_ecache_dwork_pending(net))
		nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_delete);

static inline bool
nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
		const struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_zone *zone,
		const struct net *net)
{
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

	/* A conntrack can be recreated with an equal tuple,
	 * so we need to check that the conntrack is confirmed
	 */
	return nf_ct_tuple_equal(tuple, &h->tuple) &&
	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
	       nf_ct_is_confirmed(ct) &&
	       net_eq(net, nf_ct_net(ct));
}

static inline bool
nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
{
	return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
	       nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
				 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
	       nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
	       net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
}

/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_should_gc(ct))
		nf_ct_kill(ct);

	nf_ct_put(ct);
}

/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 */
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	struct hlist_nulls_node *n;
	unsigned int bucket, hsize;

begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	bucket = reciprocal_scale(hash, hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
		struct nf_conn *ct;

		ct = nf_ct_tuplehash_to_ctrack(h);
		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net))
			return h;
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != bucket) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	return NULL;
}
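
/* Note (illustration only): reciprocal_scale(hash, hsize) maps the 32-bit
 * hash onto [0, hsize) as (u32)(((u64)hash * hsize) >> 32), which avoids a
 * modulo on the lookup fast path above.
 */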

/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
			const struct nf_conntrack_tuple *tuple, u32 hash)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	h = ____nf_conntrack_find(net, zone, tuple, hash);
	if (h) {
		/* We have a candidate that matches the tuple we're interested
		 * in, try to obtain a reference and re-check tuple
		 */
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
			/* re-check key after refcount */
			smp_acquire__after_ctrl_dep();

			if (likely(nf_ct_key_equal(h, tuple, zone, net)))
				return h;

			/* TYPESAFE_BY_RCU recycled the candidate */
			nf_ct_put(ct);
		}

		h = NULL;
	}

	return h;
}

struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
		      const struct nf_conntrack_tuple *tuple)
{
	unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	struct nf_conntrack_tuple_hash *thash;

	rcu_read_lock();

	thash = __nf_conntrack_find_get(net, zone, tuple,
					hash_conntrack_raw(tuple, zone_id, net));

	if (thash)
		goto out_unlock;

	rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
	if (rid != zone_id)
		thash = __nf_conntrack_find_get(net, zone, tuple,
						hash_conntrack_raw(tuple, rid, net));

out_unlock:
	rcu_read_unlock();
	return thash;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);

static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int reply_hash)
{
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &nf_conntrack_hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[reply_hash]);
}

static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
{
	/* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
	 * may contain stale pointers to e.g. a helper that has been removed.
	 *
	 * The helper can't clear this because the nf_conn object isn't in
	 * any hash and synchronize_rcu() isn't enough because the associated
	 * skb might sit in a queue.
	 */
	return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
}

static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
{
	if (!ext)
		return true;

	if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
		return false;

	/* inserted into conntrack table, nf_ct_iterate_cleanup()
	 * will find it. Disable nf_ct_ext_find() id check.
	 */
	WRITE_ONCE(ext->gen_id, 0);
	return true;
}

int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
	const struct nf_conntrack_zone *zone;
	struct net *net = nf_ct_net(ct);
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int max_chainlen;
	unsigned int chainlen = 0;
	unsigned int sequence;
	int err = -EEXIST;

	zone = nf_ct_zone(ct);

	if (!nf_ct_ext_valid_pre(ct->ext))
		return -EAGAIN;

	local_bh_disable();
	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		hash = hash_conntrack(net,
				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL));
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);

	/* See if there's one in the list already, including reverse */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;

		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;

	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	/* If genid has changed, we can't insert anymore because ct
	 * extensions could have stale pointers and nf_ct_iterate_destroy
	 * might have completed its table scan already.
	 *
	 * Increment of the ext genid right after this check is fine:
	 * nf_ct_iterate_destroy blocks until locks are released.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		err = -EAGAIN;
		goto out;
	}

	smp_wmb();
	/* The caller holds a reference to this object */
	refcount_set(&ct->ct_general.use, 2);
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	NF_CT_STAT_INC(net, insert);
	local_bh_enable();

	return 0;
chaintoolong:
	NF_CT_STAT_INC(net, chaintoolong);
	err = -ENOSPC;
out:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);

void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
		    unsigned int bytes)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;

		atomic64_add(packets, &counter[dir].packets);
		atomic64_add(bytes, &counter[dir].bytes);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_acct_add);

static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
			     const struct nf_conn *loser_ct)
{
	struct nf_conn_acct *acct;

	acct = nf_conn_acct_find(loser_ct);
	if (acct) {
		struct nf_conn_counter *counter = acct->counter;
		unsigned int bytes;

		/* u32 should be fine since we must have seen one packet. */
		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
	}
}

static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
{
	struct nf_conn_tstamp *tstamp;

	refcount_inc(&ct->ct_general.use);

	/* set conntrack timestamp, if enabled. */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply. Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table. This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}

/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped. In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry. The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.
	   The actual packet which created the connection will be IP_CT_NEW
	   or, for an expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns. But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race. This may happen for bridge (br_flood)
	 * or broadcast/multicast packets that do skb_clone() with an
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + get_random_u32_below(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash. If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held,
	 * but will go out of scope soon); we need to remove
	 * this conntrack again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);

/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from the same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8
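
/* Illustration of the eviction policy implemented below (summary only):
 * when the table is full, early_drop() scans up to NF_CT_EVICTION_RANGE
 * consecutive buckets, starting at the bucket the new entry hashes to,
 * and deletes the first confirmed entry in this netns that is neither
 * assured nor dying; if nothing can be evicted, the new packet is dropped.
 */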

/* There's a small race here where we may free a just-assured
   connection. Too bad: we're in trouble anyway. */
static unsigned int early_drop_list(struct net *net,
				    struct hlist_nulls_head *head)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int drops = 0;
	struct nf_conn *tmp;

	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
		tmp = nf_ct_tuplehash_to_ctrack(h);

		if (nf_ct_is_expired(tmp)) {
			nf_ct_gc_expired(tmp);
			continue;
		}

		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
		    !net_eq(nf_ct_net(tmp), net) ||
		    nf_ct_is_dying(tmp))
			continue;

		if (!refcount_inc_not_zero(&tmp->ct_general.use))
			continue;

		/* load ->ct_net and ->status after refcount increase */
		smp_acquire__after_ctrl_dep();

		/* kill only if still in same netns -- might have moved due to
		 * SLAB_TYPESAFE_BY_RCU rules.
		 *
		 * We steal the timer reference. If that fails, the timer has
		 * already fired or someone else deleted it. Just drop ref
		 * and move to next entry.
		 */
		if (net_eq(nf_ct_net(tmp), net) &&
		    nf_ct_is_confirmed(tmp) &&
		    nf_ct_delete(tmp, 0, 0))
			drops++;

		nf_ct_put(tmp);
	}

	return drops;
}

static noinline int early_drop(struct net *net, unsigned int hash)
{
	unsigned int i, bucket;

	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
		struct hlist_nulls_head *ct_hash;
		unsigned int hsize, drops;

		rcu_read_lock();
		nf_conntrack_get_ht(&ct_hash, &hsize);
		if (!i)
			bucket = reciprocal_scale(hash, hsize);
		else
			bucket = (bucket + 1) % hsize;

		drops = early_drop_list(net, &ct_hash[bucket]);
		rcu_read_unlock();

		if (drops) {
			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
			return true;
		}
	}

	return false;
}

static bool gc_worker_skip_ct(const struct nf_conn *ct)
{
	return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct);
}

static bool gc_worker_can_early_drop(const struct nf_conn *ct)
{
	const struct nf_conntrack_l4proto *l4proto;
	u8 protonum = nf_ct_protonum(ct);

	if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
		return false;
	if (!test_bit(IPS_ASSURED_BIT, &ct->status))
		return true;

	l4proto = nf_ct_l4proto_find(protonum);
	if (l4proto->can_early_drop && l4proto->can_early_drop(ct))
		return true;

	return false;
}

static void gc_worker(struct work_struct *work)
{
	unsigned int i, hashsz, nf_conntrack_max95 = 0;
	u32 end_time, start_time = nfct_time_stamp;
	struct conntrack_gc_work *gc_work;
	unsigned int expired_count = 0;
	unsigned long next_run;
	s32 delta_time;
	long count;

	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);

	i = gc_work->next_bucket;
	if (gc_work->early_drop)
		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;

	if (i == 0) {
		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
		gc_work->count = GC_SCAN_INITIAL_COUNT;
		gc_work->start_time = start_time;
	}

	next_run = gc_work->avg_timeout;
	count = gc_work->count;

	end_time = start_time + GC_SCAN_MAX_DURATION;

	do {
		struct nf_conntrack_tuple_hash *h;
		struct hlist_nulls_head *ct_hash;
		struct hlist_nulls_node *n;
		struct nf_conn *tmp;

		rcu_read_lock();

		nf_conntrack_get_ht(&ct_hash, &hashsz);
		if (i >= hashsz) {
			rcu_read_unlock();
			break;
		}

		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
			struct nf_conntrack_net *cnet;
			struct net *net;
			long expires;

			tmp = nf_ct_tuplehash_to_ctrack(h);

			if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
				nf_ct_offload_timeout(tmp);
				if (!nf_conntrack_max95)
					continue;
			}

			if (expired_count > GC_SCAN_EXPIRED_MAX) {
				rcu_read_unlock();

				gc_work->next_bucket = i;
				gc_work->avg_timeout = next_run;
				gc_work->count = count;

				delta_time = nfct_time_stamp - gc_work->start_time;

				/* re-sched immediately if total cycle time is exceeded */
				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
				goto early_exit;
			}

			if (nf_ct_is_expired(tmp)) {
				nf_ct_gc_expired(tmp);
				expired_count++;
				continue;
			}

			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
			expires = (expires - (long)next_run) / ++count;
			next_run += expires;

			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
				continue;

			net = nf_ct_net(tmp);
			cnet = nf_ct_pernet(net);
			if (atomic_read(&cnet->count) < nf_conntrack_max95)
				continue;

			/* need to take reference to avoid possible races */
			if (!refcount_inc_not_zero(&tmp->ct_general.use))
				continue;

			/* load ->status after refcount increase */
			smp_acquire__after_ctrl_dep();

			if (gc_worker_skip_ct(tmp)) {
				nf_ct_put(tmp);
				continue;
			}

			if (gc_worker_can_early_drop(tmp)) {
				nf_ct_kill(tmp);
				expired_count++;
			}

			nf_ct_put(tmp);
		}

		/* could check get_nulls_value() here and restart if ct
		 * was moved to another chain. But given gc is best-effort
		 * we will just continue with next hash slot.
		 */
		rcu_read_unlock();
		cond_resched();
		i++;

		delta_time = nfct_time_stamp - end_time;
		if (delta_time > 0 && i < hashsz) {
			gc_work->avg_timeout = next_run;
			gc_work->count = count;
			gc_work->next_bucket = i;
			next_run = 0;
			goto early_exit;
		}
	} while (i < hashsz);

	gc_work->next_bucket = 0;

	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);

	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
	if (next_run > (unsigned long)delta_time)
		next_run -= delta_time;
	else
		next_run = 1;

early_exit:
	if (gc_work->exiting)
		return;

	if (next_run)
		gc_work->early_drop = false;

	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
}

static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
{
	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
	gc_work->exiting = false;
}

static struct nf_conn *
__nf_conntrack_alloc(struct net *net,
		     const struct nf_conntrack_zone *zone,
		     const struct nf_conntrack_tuple *orig,
		     const struct nf_conntrack_tuple *repl,
		     gfp_t gfp, u32 hash)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
	unsigned int ct_count;
	struct nf_conn *ct;

	/* We don't want any race condition at early drop stage */
	ct_count = atomic_inc_return(&cnet->count);

	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
		if (!early_drop(net, hash)) {
			if (!conntrack_gc_work.early_drop)
				conntrack_gc_work.early_drop = true;
			atomic_dec(&cnet->count);
			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_TYPESAFE_BY_RCU.
	 */
	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
	if (ct == NULL)
		goto out;

	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	/* save hash for reusing when confirming */
	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
	ct->status = 0;
	WRITE_ONCE(ct->timeout, 0);
	write_pnet(&ct->ct_net, net);
	memset_after(ct, 0, __nfct_init_offset);

	nf_ct_zone_add(ct, zone);

	/* Because we use RCU lookups, we set ct_general.use to zero before
	 * this is inserted in any list.
	 */
	refcount_set(&ct->ct_general.use, 0);
	return ct;
out:
	atomic_dec(&cnet->count);
	return ERR_PTR(-ENOMEM);
}

struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_zone *zone,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_net *cnet;

	/* A freed object has refcnt == 0, that's
	 * the golden rule for SLAB_TYPESAFE_BY_RCU
	 */
	WARN_ON(refcount_read(&ct->ct_general.use) != 0);

	if (ct->status & IPS_SRC_NAT_DONE) {
		const struct nf_nat_hook *nat_hook;

		rcu_read_lock();
		nat_hook = rcu_dereference(nf_nat_hook);
		if (nat_hook)
			nat_hook->remove_nat_bysrc(ct);
		rcu_read_unlock();
	}

	kfree(ct->ext);
	kmem_cache_free(nf_conntrack_cachep, ct);
	cnet = nf_ct_pernet(net);

	smp_mb__before_atomic();
	atomic_dec(&cnet->count);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);


/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress. Otherwise it really is unclassifiable. */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
	       const struct nf_conntrack_tuple *tuple,
	       struct sk_buff *skb,
	       unsigned int dataoff, u32 hash)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
	struct nf_conntrack_ecache *ecache;
#endif
	struct nf_conntrack_expect *exp = NULL;
	const struct nf_conntrack_zone *zone;
	struct nf_conn_timeout *timeout_ext;
	struct nf_conntrack_zone tmp;
	struct nf_conntrack_net *cnet;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple))
		return NULL;

	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
				  hash);
	if (IS_ERR(ct))
		return (struct nf_conntrack_tuple_hash *)ct;

	if (!nf_ct_add_synproxy(ct, tmpl)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}

	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;

	if (timeout_ext)
		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
				      GFP_ATOMIC);

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
	nf_ct_labels_ext_add(ct);

#ifdef CONFIG_NF_CONNTRACK_EVENTS
	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl));
		if (exp) {
			/* Welcome, Mr. Bond. We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = READ_ONCE(exp->master->mark);
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp && tmpl)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Other CPU might have obtained a pointer to this object before it was
	 * released. Because refcount is 0, refcount_inc_not_zero() will fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else another
	 * core might observe the CONFIRMED bit, which means the entry is valid
	 * and in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple))
		return 0;

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		ctinfo = IP_CT_ESTABLISHED_REPLY;
	} else {
		unsigned long status = READ_ONCE(ct->status);

		/* Once we've had two way comms, always ESTABLISHED. */
		if (likely(status & IPS_SEEN_REPLY))
			ctinfo = IP_CT_ESTABLISHED;
		else if (status & IPS_EXPECTED)
			ctinfo = IP_CT_RELATED;
		else
			ctinfo = IP_CT_NEW;
	}
	nf_ct_set(skb, ct, ctinfo);
	return 0;
}
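
/* Summary of the ctinfo values assigned above (for illustration only):
 *   IP_CT_NEW               - original direction, first packet of the flow
 *   IP_CT_RELATED           - original direction, entry created from an expectation
 *   IP_CT_ESTABLISHED       - original direction, a reply has already been seen
 *   IP_CT_ESTABLISHED_REPLY - packet matched the reply direction tuple
 */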

/*
 * ICMP packets need special treatment to handle error messages that are
 * related to a connection.
 *
 * Callers need to check if skb has a conntrack assigned when this
 * helper returns; in that case skb belongs to an already known connection.
 */
static unsigned int __cold
nf_conntrack_handle_icmp(struct nf_conn *tmpl,
			 struct sk_buff *skb,
			 unsigned int dataoff,
			 u8 protonum,
			 const struct nf_hook_state *state)
{
	int ret;

	if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP)
		ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state);
#if IS_ENABLED(CONFIG_IPV6)
	else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6)
		ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state);
#endif
	else
		return NF_ACCEPT;

	if (ret <= 0)
		NF_CT_STAT_INC_ATOMIC(state->net, error);

	return ret;
}

static int generic_packet(struct nf_conn *ct, struct sk_buff *skb,
			  enum ip_conntrack_info ctinfo)
{
	const unsigned int *timeout = nf_ct_timeout_lookup(ct);

	if (!timeout)
		timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout;

	nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
	return NF_ACCEPT;
}

/* Returns verdict for packet, or -1 for invalid. */
static int nf_conntrack_handle_packet(struct nf_conn *ct,
				      struct sk_buff *skb,
				      unsigned int dataoff,
				      enum ip_conntrack_info ctinfo,
				      const struct nf_hook_state *state)
{
	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		return nf_conntrack_tcp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_UDP:
		return nf_conntrack_udp_packet(ct, skb, dataoff,
					       ctinfo, state);
	case IPPROTO_ICMP:
		return nf_conntrack_icmp_packet(ct, skb, ctinfo, state);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_UDPLITE
	case IPPROTO_UDPLITE:
		return nf_conntrack_udplite_packet(ct, skb, dataoff,
						   ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_SCTP
	case IPPROTO_SCTP:
		return nf_conntrack_sctp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_DCCP
	case IPPROTO_DCCP:
		return nf_conntrack_dccp_packet(ct, skb, dataoff,
						ctinfo, state);
#endif
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		return nf_conntrack_gre_packet(ct, skb, dataoff,
					       ctinfo, state);
#endif
	}

	return generic_packet(ct, skb, ctinfo);
}

unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct, *tmpl;
	u_int8_t protonum;
	int dataoff, ret;

	tmpl = nf_ct_get(skb, &ctinfo);
	if (tmpl || ctinfo == IP_CT_UNTRACKED) {
		/* Previously seen (loopback or untracked)? Ignore. */
		 */
		if ((tmpl && !nf_ct_is_template(tmpl)) ||
		    ctinfo == IP_CT_UNTRACKED)
			return NF_ACCEPT;
		skb->_nfct = 0;
	}

	/* rcu_read_lock()ed by nf_hook_thresh */
	dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
	if (dataoff <= 0) {
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
		ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
					       protonum, state);
		if (ret <= 0) {
			ret = -ret;
			goto out;
		}
		/* ICMP[v6] protocol trackers may assign one conntrack. */
		if (skb->_nfct)
			goto out;
	}
repeat:
	ret = resolve_normal_ct(tmpl, skb, dataoff,
				protonum, state);
	if (ret < 0) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(state->net, drop);
		ret = NF_DROP;
		goto out;
	}

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		ret = NF_ACCEPT;
		goto out;
	}

	ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		nf_ct_put(ct);
		skb->_nfct = 0;
		/* Special case: TCP tracker reports an attempt to reopen a
		 * closed/aborted connection. We have to go back and create a
		 * fresh conntrack.
		 */
		if (ret == -NF_REPEAT)
			goto repeat;

		NF_CT_STAT_INC_ATOMIC(state->net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(state->net, drop);

		ret = -ret;
		goto out;
	}

	if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
	    !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
	if (tmpl)
		nf_ct_put(tmpl);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

/* Alter reply tuple (maybe alter helper).
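 * NAT uses this once the reply direction mapping has been chosen, so that
 * the stored reply tuple matches the packets that will actually come back.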
 * This is for NAT, and is implicitly racy: see __nf_conntrack_confirm.
 */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	WARN_ON(nf_ct_is_confirmed(ct));

	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is true */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  u32 extra_jiffies,
			  bool do_acct)
{
	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (nf_ct_is_confirmed(ct))
		extra_jiffies += nfct_time_stamp;

	if (READ_ONCE(ct->timeout) != extra_jiffies)
		WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
	if (do_acct)
		nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool nf_ct_kill_acct(struct nf_conn *ct,
		     enum ip_conntrack_info ctinfo,
		     const struct sk_buff *skb)
{
	nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);

	return nf_ct_delete(ct, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_kill_acct);

#if IS_ENABLED(CONFIG_NF_CT_NETLINK)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t,
			       u_int32_t flags)
{
	if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) {
		if (!tb[CTA_PROTO_SRC_PORT])
			return -EINVAL;

		t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	}

	if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) {
		if (!tb[CTA_PROTO_DST_PORT])
			return -EINVAL;

		t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

unsigned int nf_ct_port_nlattr_tuple_size(void)
{
	static unsigned int size __read_mostly;

	if (!size)
		size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);

	return size;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif

/* Used by ipt_REJECT and ip6t_REJECT.
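 * The REJECT target builds a TCP RST or ICMP error in response to a packet;
 * attaching the original conntrack to that generated skb (with the ctinfo
 * direction reversed) lets it be handled as part of the same flow.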
 */
static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nf_ct_set(nskb, ct, ctinfo);
	nf_conntrack_get(skb_nfct(nskb));
}

static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
				 struct nf_conn *ct,
				 enum ip_conntrack_info ctinfo)
{
	const struct nf_nat_hook *nat_hook;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conntrack_tuple tuple;
	unsigned int status;
	int dataoff;
	u16 l3num;
	u8 l4num;

	l3num = nf_ct_l3num(ct);

	dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
	if (dataoff <= 0)
		return -1;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
			     l4num, net, &tuple))
		return -1;

	if (ct->status & IPS_SRC_NAT) {
		memcpy(tuple.src.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
		       sizeof(tuple.src.u3.all));
		tuple.src.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
	}

	if (ct->status & IPS_DST_NAT) {
		memcpy(tuple.dst.u3.all,
		       ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all,
		       sizeof(tuple.dst.u3.all));
		tuple.dst.u.all =
			ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all;
	}

	h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple);
	if (!h)
		return 0;

	/* Store status bits of the conntrack that is clashing to re-do NAT
	 * mangling according to what has already been done to this packet.
	 */
	status = ct->status;

	nf_ct_put(ct);
	ct = nf_ct_tuplehash_to_ctrack(h);
	nf_ct_set(skb, ct, ctinfo);

	nat_hook = rcu_dereference(nf_nat_hook);
	if (!nat_hook)
		return 0;

	if (status & IPS_SRC_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	if (status & IPS_DST_NAT &&
	    nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST,
				IP_CT_DIR_ORIGINAL) == NF_DROP)
		return -1;

	return 0;
}

/* This packet is coming from userspace via nf_queue, complete the packet
 * processing after the helper invocation in nf_confirm().
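 *
 * A userspace helper may have mangled the payload, so any TCP sequence
 * number adjustments registered for the connection are applied here before
 * the entry is confirmed.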
 */
static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo)
{
	const struct nf_conntrack_helper *helper;
	const struct nf_conn_help *help;
	int protoff;

	help = nfct_help(ct);
	if (!help)
		return 0;

	helper = rcu_dereference(help->helper);
	if (!helper)
		return 0;

	if (!(helper->flags & NF_CT_HELPER_F_USERSPACE))
		return 0;

	switch (nf_ct_l3num(ct)) {
	case NFPROTO_IPV4:
		protoff = skb_network_offset(skb) + ip_hdrlen(skb);
		break;
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6: {
		__be16 frag_off;
		u8 pnum;

		pnum = ipv6_hdr(skb)->nexthdr;
		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
					   &frag_off);
		if (protoff < 0 || (frag_off & htons(~0x7)) != 0)
			return 0;
		break;
	}
#endif
	default:
		return 0;
	}

	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
	    !nf_is_loopback_packet(skb)) {
		if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
			NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
			return -1;
		}
	}

	/* We've seen it coming out the other side: confirm it */
	return nf_conntrack_confirm(skb) == NF_DROP ? -1 : 0;
}

static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return 0;

	if (!nf_ct_is_confirmed(ct)) {
		err = __nf_conntrack_update(net, skb, ct, ctinfo);
		if (err < 0)
			return err;

		ct = nf_ct_get(skb, &ctinfo);
	}

	return nf_confirm_cthelper(skb, ct, ctinfo);
}

static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
				       const struct sk_buff *skb)
{
	const struct nf_conntrack_tuple *src_tuple;
	const struct nf_conntrack_tuple_hash *hash;
	struct nf_conntrack_tuple srctuple;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct) {
		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
		return true;
	}

	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
			       NFPROTO_IPV4, dev_net(skb->dev),
			       &srctuple))
		return false;

	hash = nf_conntrack_find_get(dev_net(skb->dev),
				     &nf_ct_zone_dflt,
				     &srctuple);
	if (!hash)
		return false;

	ct = nf_ct_tuplehash_to_ctrack(hash);
	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
	nf_ct_put(ct);

	return true;
}

/* Bring out ya dead!
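 *
 * Walk the hash table and return the next conntrack that the iterator
 * callback selects, with its reference count bumped so the caller can
 * delete it after the bucket lock has been dropped.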
 */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
		const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;
	spinlock_t *lockp;

	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
		struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket];

		if (hlist_nulls_empty(hslot))
			continue;

		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
		local_bh_disable();
		nf_conntrack_lock(lockp);
		hlist_nulls_for_each_entry(h, n, hslot, hnnode) {
			if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY)
				continue;
			/* All nf_conn objects are added to the hash table twice,
			 * once for the original direction tuple, once for the
			 * reply tuple.
			 *
			 * Exception: In the IPS_NAT_CLASH case, only the reply
			 * tuple is added (the original tuple already existed for
			 * a different object).
			 *
			 * We only need to call the iterator once for each
			 * conntrack, so we just use the 'reply' direction
			 * tuple while iterating.
			 */
			ct = nf_ct_tuplehash_to_ctrack(h);

			if (iter_data->net &&
			    !net_eq(iter_data->net, nf_ct_net(ct)))
				continue;

			if (iter(ct, iter_data->data))
				goto found;
		}
		spin_unlock(lockp);
		local_bh_enable();
		cond_resched();
	}

	return NULL;
found:
	refcount_inc(&ct->ct_general.use);
	spin_unlock(lockp);
	local_bh_enable();
	return ct;
}

static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
				  const struct nf_ct_iter_data *iter_data)
{
	unsigned int bucket = 0;
	struct nf_conn *ct;

	might_sleep();

	mutex_lock(&nf_conntrack_mutex);
	while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
		/* Time to push up daisies... */

		nf_ct_delete(ct, iter_data->portid, iter_data->report);
		nf_ct_put(ct);
		cond_resched();
	}
	mutex_unlock(&nf_conntrack_mutex);
}

void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
			       const struct nf_ct_iter_data *iter_data)
{
	struct net *net = iter_data->net;
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	might_sleep();

	if (atomic_read(&cnet->count) == 0)
		return;

	nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * main table).
 *
 * Can only be called in module exit path.
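 *
 * Unlike nf_ct_iterate_cleanup_net(), this walks conntracks from all
 * network namespaces and first drops any packets still queued to
 * userspace via nf_queue.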
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		struct nf_conntrack_net *cnet = nf_ct_pernet(net);

		if (atomic_read(&cnet->count) == 0)
			continue;
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for the netns cleanup worker to finish, if it's
	 * running -- it might have deleted a net namespace from
	 * the global list, so the hook drop above might not have
	 * affected all namespaces.
	 */
	net_ns_barrier();

	/* An skb with an unconfirmed conntrack could have been reinjected just
	 * before we called nf_queue_nf_hook_drop().
	 *
	 * This makes sure it's inserted into the conntrack table.
	 */
	synchronize_net();

	nf_ct_ext_bump_genid();
	iter_data.data = data;
	nf_ct_iterate_cleanup(iter, &iter_data);

	/* Another CPU might be in an RCU read section with the RCU-protected
	 * pointer cleared in the iter callback or hidden via
	 * nf_ct_ext_bump_genid() above.
	 *
	 * Wait until those are done.
	 */
	synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_conntrack_cleanup_start(void)
{
	cleanup_nf_conntrack_bpf();
	conntrack_gc_work.exiting = true;
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	struct nf_ct_iter_data iter_data = {};
	struct net *net;
	int busy;

	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework.  Roll on, two-stage module
	 * delete...
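	 *
	 * Packets still in flight may hold conntrack references, so wait for
	 * them to drain, then sweep each namespace repeatedly until its
	 * conntrack count drops to zero.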
2542 */ 2543 synchronize_net(); 2544 i_see_dead_people: 2545 busy = 0; 2546 list_for_each_entry(net, net_exit_list, exit_list) { 2547 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2548 2549 iter_data.net = net; 2550 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2551 if (atomic_read(&cnet->count) != 0) 2552 busy = 1; 2553 } 2554 if (busy) { 2555 schedule(); 2556 goto i_see_dead_people; 2557 } 2558 2559 list_for_each_entry(net, net_exit_list, exit_list) { 2560 nf_conntrack_ecache_pernet_fini(net); 2561 nf_conntrack_expect_pernet_fini(net); 2562 free_percpu(net->ct.stat); 2563 } 2564 } 2565 2566 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2567 { 2568 struct hlist_nulls_head *hash; 2569 unsigned int nr_slots, i; 2570 2571 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2572 return NULL; 2573 2574 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2575 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2576 2577 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2578 2579 if (hash && nulls) 2580 for (i = 0; i < nr_slots; i++) 2581 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2582 2583 return hash; 2584 } 2585 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2586 2587 int nf_conntrack_hash_resize(unsigned int hashsize) 2588 { 2589 int i, bucket; 2590 unsigned int old_size; 2591 struct hlist_nulls_head *hash, *old_hash; 2592 struct nf_conntrack_tuple_hash *h; 2593 struct nf_conn *ct; 2594 2595 if (!hashsize) 2596 return -EINVAL; 2597 2598 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2599 if (!hash) 2600 return -ENOMEM; 2601 2602 mutex_lock(&nf_conntrack_mutex); 2603 old_size = nf_conntrack_htable_size; 2604 if (old_size == hashsize) { 2605 mutex_unlock(&nf_conntrack_mutex); 2606 kvfree(hash); 2607 return 0; 2608 } 2609 2610 local_bh_disable(); 2611 nf_conntrack_all_lock(); 2612 write_seqcount_begin(&nf_conntrack_generation); 2613 2614 /* Lookups in the old hash might happen in parallel, which means we 2615 * might get false negatives during connection lookup. New connections 2616 * created because of a false negative won't make it into the hash 2617 * though since that required taking the locks. 2618 */ 2619 2620 for (i = 0; i < nf_conntrack_htable_size; i++) { 2621 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2622 unsigned int zone_id; 2623 2624 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2625 struct nf_conntrack_tuple_hash, hnnode); 2626 ct = nf_ct_tuplehash_to_ctrack(h); 2627 hlist_nulls_del_rcu(&h->hnnode); 2628 2629 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2630 bucket = __hash_conntrack(nf_ct_net(ct), 2631 &h->tuple, zone_id, hashsize); 2632 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2633 } 2634 } 2635 old_hash = nf_conntrack_hash; 2636 2637 nf_conntrack_hash = hash; 2638 nf_conntrack_htable_size = hashsize; 2639 2640 write_seqcount_end(&nf_conntrack_generation); 2641 nf_conntrack_all_unlock(); 2642 local_bh_enable(); 2643 2644 mutex_unlock(&nf_conntrack_mutex); 2645 2646 synchronize_net(); 2647 kvfree(old_hash); 2648 return 0; 2649 } 2650 2651 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2652 { 2653 unsigned int hashsize; 2654 int rc; 2655 2656 if (current->nsproxy->net_ns != &init_net) 2657 return -EOPNOTSUPP; 2658 2659 /* On boot, we can set this without any fancy locking. 
	 */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}

int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	seqcount_spinlock_init(&nf_conntrack_generation,
			       &nf_conntrack_locks_all_lock);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (BITS_PER_LONG >= 64 &&
		    nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 262144;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 65536;

		if (nf_conntrack_htable_size < 1024)
			nf_conntrack_htable_size = 1024;
		/* Use a max. factor of one by default to keep the average
		 * hash chain length at 2 entries.  Each entry has to be added
		 * twice (once for original direction, once for reply).
		 * When a table size is given we use the old value of 8 to
		 * avoid implicit reduction of the max entries setting.
		 */
		max_factor = 1;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	ret = register_nf_conntrack_bpf();
	if (ret < 0)
		goto err_kfunc;

	return 0;

err_kfunc:
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	nf_conntrack_proto_fini();
err_proto:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}

static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = nf_ct_to_nf_conn(nfct);

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		nf_conntrack_tcp_set_closing(ct);
		break;
	}
}

static const struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= nf_ct_destroy,
	.get_tuple_skb	= nf_conntrack_get_tuple_skb,
	.attach		= nf_conntrack_attach,
	.set_closing	= nf_conntrack_set_closing,
	.confirm	= __nf_conntrack_confirm,
};

void nf_conntrack_init_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)

int nf_conntrack_init_net(struct net *net)
{
	struct nf_conntrack_net *cnet =
		nf_ct_pernet(net);
	int ret = -ENOMEM;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
	atomic_set(&cnet->count, 0);

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		return ret;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_proto_pernet_init(net);

	return 0;

err_expect:
	free_percpu(net->ct.stat);
	return ret;
}

/* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */

int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout)
{
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		return -EPERM;

	__nf_ct_set_timeout(ct, timeout);

	if (test_bit(IPS_DYING_BIT, &ct->status))
		return -ETIME;

	return 0;
}
EXPORT_SYMBOL_GPL(__nf_ct_change_timeout);

void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off)
{
	unsigned int bit;

	/* Ignore these unchangeable bits */
	on &= ~IPS_UNCHANGEABLE_MASK;
	off &= ~IPS_UNCHANGEABLE_MASK;

	for (bit = 0; bit < __IPS_MAX_BIT; bit++) {
		if (on & (1 << bit))
			set_bit(bit, &ct->status);
		else if (off & (1 << bit))
			clear_bit(bit, &ct->status);
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_change_status);

int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status)
{
	unsigned long d;

	d = ct->status ^ status;

	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
		/* unchangeable */
		return -EBUSY;

	if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY))
		/* SEEN_REPLY bit can only be set */
		return -EBUSY;

	if (d & IPS_ASSURED && !(status & IPS_ASSURED))
		/* ASSURED bit can only be set */
		return -EBUSY;

	__nf_ct_change_status(ct, status, 0);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_change_status_common);