// SPDX-License-Identifier: GPL-2.0-only
/* Connection state tracking for netfilter.  This is separated from,
 * but required by, the NAT layer; it can also be used by an iptables
 * extension.
 */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
#include <net/ip.h>

#include "nf_internals.h"

__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
EXPORT_SYMBOL_GPL(nf_conntrack_locks);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);

struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_hash);

struct conntrack_gc_work {
	struct delayed_work	dwork;
	u32			next_bucket;
	u32			avg_timeout;
	u32			start_time;
	bool			exiting;
	bool			early_drop;
};

static __read_mostly struct kmem_cache *nf_conntrack_cachep;
static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
static __read_mostly bool nf_conntrack_locks_all;

/* serialize hash resizes and nf_ct_iterate_cleanup */
static DEFINE_MUTEX(nf_conntrack_mutex);

#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)

/* clamp timeouts to this value (TCP unacked) */
#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)

/* large initial bias so that we don't scan often just because we have
 * three entries with a 1s timeout.
 */
#define GC_SCAN_INTERVAL_INIT	INT_MAX

#define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)

#define MIN_CHAINLEN	8u
#define MAX_CHAINLEN	(32u - MIN_CHAINLEN)

static struct conntrack_gc_work conntrack_gc_work;
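/* Locking overview (illustrative sketch only; the variable names below are
 * placeholders, not code used by this file): writers hash both tuple
 * directions under the nf_conntrack_generation seqcount and retry if a
 * table resize changed the hash parameters underneath them:
 *
 *	do {
 *		sequence = read_seqcount_begin(&nf_conntrack_generation);
 *		hash = hash_conntrack(net, orig_tuple, orig_zone_id);
 *		reply_hash = hash_conntrack(net, reply_tuple, reply_zone_id);
 *	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 *	...
 *	nf_conntrack_double_unlock(hash, reply_hash);
 *
 * This is the pattern used by the delete and confirm paths further down.
 * nf_conntrack_all_lock()/nf_conntrack_all_unlock() let the resize code
 * exclude all such per-bucket lockers at once.
 */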
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
{
	/* 1) Acquire the lock */
	spin_lock(lock);

	/* 2) read nf_conntrack_locks_all, with ACQUIRE semantics
	 * It pairs with the smp_store_release() in nf_conntrack_all_unlock()
	 */
	if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false))
		return;

	/* fast path failed, unlock */
	spin_unlock(lock);

	/* Slow path 1) get global lock */
	spin_lock(&nf_conntrack_locks_all_lock);

	/* Slow path 2) get the lock we want */
	spin_lock(lock);

	/* Slow path 3) release the global lock */
	spin_unlock(&nf_conntrack_locks_all_lock);
}
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	spin_unlock(&nf_conntrack_locks[h1]);
	if (h1 != h2)
		spin_unlock(&nf_conntrack_locks[h2]);
}

/* return true if we need to recompute hashes (in case hash table was resized) */
static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
				     unsigned int h2, unsigned int sequence)
{
	h1 %= CONNTRACK_LOCKS;
	h2 %= CONNTRACK_LOCKS;
	if (h1 <= h2) {
		nf_conntrack_lock(&nf_conntrack_locks[h1]);
		if (h1 != h2)
			spin_lock_nested(&nf_conntrack_locks[h2],
					 SINGLE_DEPTH_NESTING);
	} else {
		nf_conntrack_lock(&nf_conntrack_locks[h2]);
		spin_lock_nested(&nf_conntrack_locks[h1],
				 SINGLE_DEPTH_NESTING);
	}
	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
		nf_conntrack_double_unlock(h1, h2);
		return true;
	}
	return false;
}

static void nf_conntrack_all_lock(void)
	__acquires(&nf_conntrack_locks_all_lock)
{
	int i;

	spin_lock(&nf_conntrack_locks_all_lock);

	/* For nf_conntrack_locks_all, only the latest time when another
	 * CPU will see an update is controlled by the "release" of the
	 * spin_lock below.
	 * The earliest time is not controlled, and thus KCSAN could detect
	 * a race when nf_conntrack_lock() reads the variable.
	 * WRITE_ONCE() is used to ensure the compiler will not
	 * optimize the write.
	 */
	WRITE_ONCE(nf_conntrack_locks_all, true);

	for (i = 0; i < CONNTRACK_LOCKS; i++) {
		spin_lock(&nf_conntrack_locks[i]);

		/* This spin_unlock provides the "release" to ensure that
		 * nf_conntrack_locks_all==true is visible to everyone that
		 * acquired spin_lock(&nf_conntrack_locks[]).
		 */
		spin_unlock(&nf_conntrack_locks[i]);
	}
}

static void nf_conntrack_all_unlock(void)
	__releases(&nf_conntrack_locks_all_lock)
{
	/* All prior stores must be complete before we clear
	 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
	 * might observe the false value but not the entire
	 * critical section.
193 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 194 */ 195 smp_store_release(&nf_conntrack_locks_all, false); 196 spin_unlock(&nf_conntrack_locks_all_lock); 197 } 198 199 unsigned int nf_conntrack_htable_size __read_mostly; 200 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 201 202 unsigned int nf_conntrack_max __read_mostly; 203 EXPORT_SYMBOL_GPL(nf_conntrack_max); 204 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 205 static siphash_aligned_key_t nf_conntrack_hash_rnd; 206 207 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 208 unsigned int zoneid, 209 const struct net *net) 210 { 211 struct { 212 struct nf_conntrack_man src; 213 union nf_inet_addr dst_addr; 214 unsigned int zone; 215 u32 net_mix; 216 u16 dport; 217 u16 proto; 218 } __aligned(SIPHASH_ALIGNMENT) combined; 219 220 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 221 222 memset(&combined, 0, sizeof(combined)); 223 224 /* The direction must be ignored, so handle usable members manually. */ 225 combined.src = tuple->src; 226 combined.dst_addr = tuple->dst.u3; 227 combined.zone = zoneid; 228 combined.net_mix = net_hash_mix(net); 229 combined.dport = (__force __u16)tuple->dst.u.all; 230 combined.proto = tuple->dst.protonum; 231 232 return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); 233 } 234 235 static u32 scale_hash(u32 hash) 236 { 237 return reciprocal_scale(hash, nf_conntrack_htable_size); 238 } 239 240 static u32 __hash_conntrack(const struct net *net, 241 const struct nf_conntrack_tuple *tuple, 242 unsigned int zoneid, 243 unsigned int size) 244 { 245 return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); 246 } 247 248 static u32 hash_conntrack(const struct net *net, 249 const struct nf_conntrack_tuple *tuple, 250 unsigned int zoneid) 251 { 252 return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); 253 } 254 255 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 256 unsigned int dataoff, 257 struct nf_conntrack_tuple *tuple) 258 { struct { 259 __be16 sport; 260 __be16 dport; 261 } _inet_hdr, *inet_hdr; 262 263 /* Actually only need first 4 bytes to get ports. 
*/ 264 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 265 if (!inet_hdr) 266 return false; 267 268 tuple->src.u.udp.port = inet_hdr->sport; 269 tuple->dst.u.udp.port = inet_hdr->dport; 270 return true; 271 } 272 273 static bool 274 nf_ct_get_tuple(const struct sk_buff *skb, 275 unsigned int nhoff, 276 unsigned int dataoff, 277 u_int16_t l3num, 278 u_int8_t protonum, 279 struct net *net, 280 struct nf_conntrack_tuple *tuple) 281 { 282 unsigned int size; 283 const __be32 *ap; 284 __be32 _addrs[8]; 285 286 memset(tuple, 0, sizeof(*tuple)); 287 288 tuple->src.l3num = l3num; 289 switch (l3num) { 290 case NFPROTO_IPV4: 291 nhoff += offsetof(struct iphdr, saddr); 292 size = 2 * sizeof(__be32); 293 break; 294 case NFPROTO_IPV6: 295 nhoff += offsetof(struct ipv6hdr, saddr); 296 size = sizeof(_addrs); 297 break; 298 default: 299 return true; 300 } 301 302 ap = skb_header_pointer(skb, nhoff, size, _addrs); 303 if (!ap) 304 return false; 305 306 switch (l3num) { 307 case NFPROTO_IPV4: 308 tuple->src.u3.ip = ap[0]; 309 tuple->dst.u3.ip = ap[1]; 310 break; 311 case NFPROTO_IPV6: 312 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 313 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 314 break; 315 } 316 317 tuple->dst.protonum = protonum; 318 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 319 320 switch (protonum) { 321 #if IS_ENABLED(CONFIG_IPV6) 322 case IPPROTO_ICMPV6: 323 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); 324 #endif 325 case IPPROTO_ICMP: 326 return icmp_pkt_to_tuple(skb, dataoff, net, tuple); 327 #ifdef CONFIG_NF_CT_PROTO_GRE 328 case IPPROTO_GRE: 329 return gre_pkt_to_tuple(skb, dataoff, net, tuple); 330 #endif 331 case IPPROTO_TCP: 332 case IPPROTO_UDP: 333 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 334 case IPPROTO_UDPLITE: 335 #endif 336 #ifdef CONFIG_NF_CT_PROTO_SCTP 337 case IPPROTO_SCTP: 338 #endif 339 #ifdef CONFIG_NF_CT_PROTO_DCCP 340 case IPPROTO_DCCP: 341 #endif 342 /* fallthrough */ 343 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 344 default: 345 break; 346 } 347 348 return true; 349 } 350 351 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 352 u_int8_t *protonum) 353 { 354 int dataoff = -1; 355 const struct iphdr *iph; 356 struct iphdr _iph; 357 358 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 359 if (!iph) 360 return -1; 361 362 /* Conntrack defragments packets, we might still see fragments 363 * inside ICMP packets though. 
 */
	if (iph->frag_off & htons(IP_OFFSET))
		return -1;

	dataoff = nhoff + (iph->ihl << 2);
	*protonum = iph->protocol;

	/* Check bogus IP headers */
	if (dataoff > skb->len) {
		pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
			 nhoff, iph->ihl << 2, skb->len);
		return -1;
	}
	return dataoff;
}

#if IS_ENABLED(CONFIG_IPV6)
static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
			    u8 *protonum)
{
	int protoff = -1;
	unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
	__be16 frag_off;
	u8 nexthdr;

	if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
			  &nexthdr, sizeof(nexthdr)) != 0) {
		pr_debug("can't get nexthdr\n");
		return -1;
	}
	protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
	/*
	 * (protoff == skb->len) means the packet has no data, just
	 * IPv6 and possibly extension headers, but it is tracked anyway
	 */
	if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
		pr_debug("can't find proto in pkt\n");
		return -1;
	}

	*protonum = nexthdr;
	return protoff;
}
#endif

static int get_l4proto(const struct sk_buff *skb,
		       unsigned int nhoff, u8 pf, u8 *l4num)
{
	switch (pf) {
	case NFPROTO_IPV4:
		return ipv4_get_l4proto(skb, nhoff, l4num);
#if IS_ENABLED(CONFIG_IPV6)
	case NFPROTO_IPV6:
		return ipv6_get_l4proto(skb, nhoff, l4num);
#endif
	default:
		*l4num = 0;
		break;
	}
	return -1;
}

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num,
		       struct net *net, struct nf_conntrack_tuple *tuple)
{
	u8 protonum;
	int protoff;

	protoff = get_l4proto(skb, nhoff, l3num, &protonum);
	if (protoff <= 0)
		return false;

	return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;

	switch (orig->src.l3num) {
	case NFPROTO_IPV4:
		inverse->src.u3.ip = orig->dst.u3.ip;
		inverse->dst.u3.ip = orig->src.u3.ip;
		break;
	case NFPROTO_IPV6:
		inverse->src.u3.in6 = orig->dst.u3.in6;
		inverse->dst.u3.in6 = orig->src.u3.in6;
		break;
	default:
		break;
	}

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;

	switch (orig->dst.protonum) {
	case IPPROTO_ICMP:
		return nf_conntrack_invert_icmp_tuple(inverse, orig);
#if IS_ENABLED(CONFIG_IPV6)
	case IPPROTO_ICMPV6:
		return nf_conntrack_invert_icmpv6_tuple(inverse, orig);
#endif
	}

	inverse->src.u.all = orig->dst.u.all;
	inverse->dst.u.all = orig->src.u.all;
	return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);

/* Generate an almost-unique pseudo-id for a given conntrack.
 *
 * Intentionally doesn't re-use any of the seeds used for hash
 * table location; we assume the id gets exposed to userspace.
 *
 * Following nf_conn items do not change throughout lifetime
 * of the nf_conn:
 *
 * 1. nf_conn address
 * 2. nf_conn->master address (normally NULL)
 * 3. the associated net namespace
 * 4.
the original direction tuple 493 */ 494 u32 nf_ct_get_id(const struct nf_conn *ct) 495 { 496 static siphash_aligned_key_t ct_id_seed; 497 unsigned long a, b, c, d; 498 499 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); 500 501 a = (unsigned long)ct; 502 b = (unsigned long)ct->master; 503 c = (unsigned long)nf_ct_net(ct); 504 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 505 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), 506 &ct_id_seed); 507 #ifdef CONFIG_64BIT 508 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); 509 #else 510 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); 511 #endif 512 } 513 EXPORT_SYMBOL_GPL(nf_ct_get_id); 514 515 static void 516 clean_from_lists(struct nf_conn *ct) 517 { 518 pr_debug("clean_from_lists(%p)\n", ct); 519 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 520 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 521 522 /* Destroy all pending expectations */ 523 nf_ct_remove_expectations(ct); 524 } 525 526 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 527 528 /* Released via nf_ct_destroy() */ 529 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 530 const struct nf_conntrack_zone *zone, 531 gfp_t flags) 532 { 533 struct nf_conn *tmpl, *p; 534 535 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 536 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 537 if (!tmpl) 538 return NULL; 539 540 p = tmpl; 541 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 542 if (tmpl != p) { 543 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 544 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 545 } 546 } else { 547 tmpl = kzalloc(sizeof(*tmpl), flags); 548 if (!tmpl) 549 return NULL; 550 } 551 552 tmpl->status = IPS_TEMPLATE; 553 write_pnet(&tmpl->ct_net, net); 554 nf_ct_zone_add(tmpl, zone); 555 refcount_set(&tmpl->ct_general.use, 1); 556 557 return tmpl; 558 } 559 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 560 561 void nf_ct_tmpl_free(struct nf_conn *tmpl) 562 { 563 kfree(tmpl->ext); 564 565 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 566 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 567 else 568 kfree(tmpl); 569 } 570 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 571 572 static void destroy_gre_conntrack(struct nf_conn *ct) 573 { 574 #ifdef CONFIG_NF_CT_PROTO_GRE 575 struct nf_conn *master = ct->master; 576 577 if (master) 578 nf_ct_gre_keymap_destroy(master); 579 #endif 580 } 581 582 void nf_ct_destroy(struct nf_conntrack *nfct) 583 { 584 struct nf_conn *ct = (struct nf_conn *)nfct; 585 586 pr_debug("%s(%p)\n", __func__, ct); 587 WARN_ON(refcount_read(&nfct->use) != 0); 588 589 if (unlikely(nf_ct_is_template(ct))) { 590 nf_ct_tmpl_free(ct); 591 return; 592 } 593 594 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) 595 destroy_gre_conntrack(ct); 596 597 /* Expectations will have been removed in clean_from_lists, 598 * except TFTP can create an expectation on the first packet, 599 * before connection is in the list, so we need to clean here, 600 * too. 
601 */ 602 nf_ct_remove_expectations(ct); 603 604 if (ct->master) 605 nf_ct_put(ct->master); 606 607 pr_debug("%s: returning ct=%p to slab\n", __func__, ct); 608 nf_conntrack_free(ct); 609 } 610 EXPORT_SYMBOL(nf_ct_destroy); 611 612 static void __nf_ct_delete_from_lists(struct nf_conn *ct) 613 { 614 struct net *net = nf_ct_net(ct); 615 unsigned int hash, reply_hash; 616 unsigned int sequence; 617 618 do { 619 sequence = read_seqcount_begin(&nf_conntrack_generation); 620 hash = hash_conntrack(net, 621 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 622 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 623 reply_hash = hash_conntrack(net, 624 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 625 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 626 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 627 628 clean_from_lists(ct); 629 nf_conntrack_double_unlock(hash, reply_hash); 630 } 631 632 static void nf_ct_delete_from_lists(struct nf_conn *ct) 633 { 634 nf_ct_helper_destroy(ct); 635 local_bh_disable(); 636 637 __nf_ct_delete_from_lists(ct); 638 639 local_bh_enable(); 640 } 641 642 static void nf_ct_add_to_ecache_list(struct nf_conn *ct) 643 { 644 #ifdef CONFIG_NF_CONNTRACK_EVENTS 645 struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct)); 646 647 spin_lock(&cnet->ecache.dying_lock); 648 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 649 &cnet->ecache.dying_list); 650 spin_unlock(&cnet->ecache.dying_lock); 651 #endif 652 } 653 654 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 655 { 656 struct nf_conn_tstamp *tstamp; 657 struct net *net; 658 659 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 660 return false; 661 662 tstamp = nf_conn_tstamp_find(ct); 663 if (tstamp) { 664 s32 timeout = READ_ONCE(ct->timeout) - nfct_time_stamp; 665 666 tstamp->stop = ktime_get_real_ns(); 667 if (timeout < 0) 668 tstamp->stop -= jiffies_to_nsecs(-timeout); 669 } 670 671 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 672 portid, report) < 0) { 673 /* destroy event was not delivered. nf_ct_put will 674 * be done by event cache worker on redelivery. 
675 */ 676 nf_ct_helper_destroy(ct); 677 local_bh_disable(); 678 __nf_ct_delete_from_lists(ct); 679 nf_ct_add_to_ecache_list(ct); 680 local_bh_enable(); 681 682 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); 683 return false; 684 } 685 686 net = nf_ct_net(ct); 687 if (nf_conntrack_ecache_dwork_pending(net)) 688 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); 689 nf_ct_delete_from_lists(ct); 690 nf_ct_put(ct); 691 return true; 692 } 693 EXPORT_SYMBOL_GPL(nf_ct_delete); 694 695 static inline bool 696 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 697 const struct nf_conntrack_tuple *tuple, 698 const struct nf_conntrack_zone *zone, 699 const struct net *net) 700 { 701 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 702 703 /* A conntrack can be recreated with the equal tuple, 704 * so we need to check that the conntrack is confirmed 705 */ 706 return nf_ct_tuple_equal(tuple, &h->tuple) && 707 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 708 nf_ct_is_confirmed(ct) && 709 net_eq(net, nf_ct_net(ct)); 710 } 711 712 static inline bool 713 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 714 { 715 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 716 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 717 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 718 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 719 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 720 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 721 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 722 } 723 724 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 725 static void nf_ct_gc_expired(struct nf_conn *ct) 726 { 727 if (!refcount_inc_not_zero(&ct->ct_general.use)) 728 return; 729 730 /* load ->status after refcount increase */ 731 smp_acquire__after_ctrl_dep(); 732 733 if (nf_ct_should_gc(ct)) 734 nf_ct_kill(ct); 735 736 nf_ct_put(ct); 737 } 738 739 /* 740 * Warning : 741 * - Caller must take a reference on returned object 742 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 743 */ 744 static struct nf_conntrack_tuple_hash * 745 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 746 const struct nf_conntrack_tuple *tuple, u32 hash) 747 { 748 struct nf_conntrack_tuple_hash *h; 749 struct hlist_nulls_head *ct_hash; 750 struct hlist_nulls_node *n; 751 unsigned int bucket, hsize; 752 753 begin: 754 nf_conntrack_get_ht(&ct_hash, &hsize); 755 bucket = reciprocal_scale(hash, hsize); 756 757 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 758 struct nf_conn *ct; 759 760 ct = nf_ct_tuplehash_to_ctrack(h); 761 if (nf_ct_is_expired(ct)) { 762 nf_ct_gc_expired(ct); 763 continue; 764 } 765 766 if (nf_ct_key_equal(h, tuple, zone, net)) 767 return h; 768 } 769 /* 770 * if the nulls value we got at the end of this lookup is 771 * not the expected one, we must restart lookup. 772 * We probably met an item that was moved to another chain. 773 */ 774 if (get_nulls_value(n) != bucket) { 775 NF_CT_STAT_INC_ATOMIC(net, search_restart); 776 goto begin; 777 } 778 779 return NULL; 780 } 781 782 /* Find a connection corresponding to a tuple. 
*/ 783 static struct nf_conntrack_tuple_hash * 784 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 785 const struct nf_conntrack_tuple *tuple, u32 hash) 786 { 787 struct nf_conntrack_tuple_hash *h; 788 struct nf_conn *ct; 789 790 rcu_read_lock(); 791 792 h = ____nf_conntrack_find(net, zone, tuple, hash); 793 if (h) { 794 /* We have a candidate that matches the tuple we're interested 795 * in, try to obtain a reference and re-check tuple 796 */ 797 ct = nf_ct_tuplehash_to_ctrack(h); 798 if (likely(refcount_inc_not_zero(&ct->ct_general.use))) { 799 /* re-check key after refcount */ 800 smp_acquire__after_ctrl_dep(); 801 802 if (likely(nf_ct_key_equal(h, tuple, zone, net))) 803 goto found; 804 805 /* TYPESAFE_BY_RCU recycled the candidate */ 806 nf_ct_put(ct); 807 } 808 809 h = NULL; 810 } 811 found: 812 rcu_read_unlock(); 813 814 return h; 815 } 816 817 struct nf_conntrack_tuple_hash * 818 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 819 const struct nf_conntrack_tuple *tuple) 820 { 821 unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); 822 struct nf_conntrack_tuple_hash *thash; 823 824 thash = __nf_conntrack_find_get(net, zone, tuple, 825 hash_conntrack_raw(tuple, zone_id, net)); 826 827 if (thash) 828 return thash; 829 830 rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); 831 if (rid != zone_id) 832 return __nf_conntrack_find_get(net, zone, tuple, 833 hash_conntrack_raw(tuple, rid, net)); 834 return thash; 835 } 836 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 837 838 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 839 unsigned int hash, 840 unsigned int reply_hash) 841 { 842 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 843 &nf_conntrack_hash[hash]); 844 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 845 &nf_conntrack_hash[reply_hash]); 846 } 847 848 static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext) 849 { 850 /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions 851 * may contain stale pointers to e.g. helper that has been removed. 852 * 853 * The helper can't clear this because the nf_conn object isn't in 854 * any hash and synchronize_rcu() isn't enough because associated skb 855 * might sit in a queue. 856 */ 857 return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid); 858 } 859 860 static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext) 861 { 862 if (!ext) 863 return true; 864 865 if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid)) 866 return false; 867 868 /* inserted into conntrack table, nf_ct_iterate_cleanup() 869 * will find it. Disable nf_ct_ext_find() id check. 
870 */ 871 WRITE_ONCE(ext->gen_id, 0); 872 return true; 873 } 874 875 int 876 nf_conntrack_hash_check_insert(struct nf_conn *ct) 877 { 878 const struct nf_conntrack_zone *zone; 879 struct net *net = nf_ct_net(ct); 880 unsigned int hash, reply_hash; 881 struct nf_conntrack_tuple_hash *h; 882 struct hlist_nulls_node *n; 883 unsigned int max_chainlen; 884 unsigned int chainlen = 0; 885 unsigned int sequence; 886 int err = -EEXIST; 887 888 zone = nf_ct_zone(ct); 889 890 if (!nf_ct_ext_valid_pre(ct->ext)) { 891 NF_CT_STAT_INC(net, insert_failed); 892 return -ETIMEDOUT; 893 } 894 895 local_bh_disable(); 896 do { 897 sequence = read_seqcount_begin(&nf_conntrack_generation); 898 hash = hash_conntrack(net, 899 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 900 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); 901 reply_hash = hash_conntrack(net, 902 &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 903 nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); 904 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 905 906 max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); 907 908 /* See if there's one in the list already, including reverse */ 909 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { 910 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 911 zone, net)) 912 goto out; 913 914 if (chainlen++ > max_chainlen) 915 goto chaintoolong; 916 } 917 918 chainlen = 0; 919 920 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { 921 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 922 zone, net)) 923 goto out; 924 if (chainlen++ > max_chainlen) 925 goto chaintoolong; 926 } 927 928 smp_wmb(); 929 /* The caller holds a reference to this object */ 930 refcount_set(&ct->ct_general.use, 2); 931 __nf_conntrack_hash_insert(ct, hash, reply_hash); 932 nf_conntrack_double_unlock(hash, reply_hash); 933 NF_CT_STAT_INC(net, insert); 934 local_bh_enable(); 935 936 if (!nf_ct_ext_valid_post(ct->ext)) { 937 nf_ct_kill(ct); 938 NF_CT_STAT_INC(net, drop); 939 return -ETIMEDOUT; 940 } 941 942 return 0; 943 chaintoolong: 944 NF_CT_STAT_INC(net, chaintoolong); 945 err = -ENOSPC; 946 out: 947 nf_conntrack_double_unlock(hash, reply_hash); 948 local_bh_enable(); 949 return err; 950 } 951 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 952 953 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, 954 unsigned int bytes) 955 { 956 struct nf_conn_acct *acct; 957 958 acct = nf_conn_acct_find(ct); 959 if (acct) { 960 struct nf_conn_counter *counter = acct->counter; 961 962 atomic64_add(packets, &counter[dir].packets); 963 atomic64_add(bytes, &counter[dir].bytes); 964 } 965 } 966 EXPORT_SYMBOL_GPL(nf_ct_acct_add); 967 968 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 969 const struct nf_conn *loser_ct) 970 { 971 struct nf_conn_acct *acct; 972 973 acct = nf_conn_acct_find(loser_ct); 974 if (acct) { 975 struct nf_conn_counter *counter = acct->counter; 976 unsigned int bytes; 977 978 /* u32 should be fine since we must have seen one packet. */ 979 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 980 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 981 } 982 } 983 984 static void __nf_conntrack_insert_prepare(struct nf_conn *ct) 985 { 986 struct nf_conn_tstamp *tstamp; 987 988 refcount_inc(&ct->ct_general.use); 989 990 /* set conntrack timestamp, if enabled. 
 */
	tstamp = nf_conn_tstamp_find(ct);
	if (tstamp)
		tstamp->start = ktime_get_real_ns();
}

/* caller must hold locks to prevent concurrent changes */
static int __nf_ct_resolve_clash(struct sk_buff *skb,
				 struct nf_conntrack_tuple_hash *h)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;

	loser_ct = nf_ct_get(skb, &ctinfo);

	if (nf_ct_is_dying(ct))
		return NF_DROP;

	if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
	    nf_ct_match(ct, loser_ct)) {
		struct net *net = nf_ct_net(ct);

		nf_conntrack_get(&ct->ct_general);

		nf_ct_acct_merge(ct, ctinfo, loser_ct);
		nf_ct_put(loser_ct);
		nf_ct_set(skb, ct, ctinfo);

		NF_CT_STAT_INC(net, clash_resolve);
		return NF_ACCEPT;
	}

	return NF_DROP;
}

/**
 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry
 *
 * @skb: skb that causes the collision
 * @repl_idx: hash slot for reply direction
 *
 * Called when origin or reply direction had a clash.
 * The skb can be handled without packet drop provided the reply direction
 * is unique or the existing entry has the identical tuple in both
 * directions.
 *
 * Caller must hold conntrack table locks to prevent concurrent updates.
 *
 * Returns NF_DROP if the clash could not be handled.
 */
static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx)
{
	struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	struct net *net;

	zone = nf_ct_zone(loser_ct);
	net = nf_ct_net(loser_ct);

	/* Reply direction must never result in a clash, unless both origin
	 * and reply tuples are identical.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) {
		if (nf_ct_key_equal(h,
				    &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			return __nf_ct_resolve_clash(skb, h);
	}

	/* We want the clashing entry to go away real soon: 1 second timeout. */
	WRITE_ONCE(loser_ct->timeout, nfct_time_stamp + HZ);

	/* IPS_NAT_CLASH removes the entry automatically on the first
	 * reply.  Also prevents UDP tracker from moving the entry to
	 * ASSURED state, i.e. the entry can always be evicted under
	 * pressure.
	 */
	loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH;

	__nf_conntrack_insert_prepare(loser_ct);

	/* fake add for ORIGINAL dir: we want lookups to only find the entry
	 * already in the table.  This also hides the clashing entry from
	 * ctnetlink iteration, i.e. conntrack -L won't show them.
	 */
	hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &nf_conntrack_hash[repl_idx]);

	NF_CT_STAT_INC(net, clash_resolve);
	return NF_ACCEPT;
}
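/* Worked example (illustrative only, not derived from a specific trace):
 * two packets of the same new UDP flow race through NAT on different CPUs.
 * Each gets its own unconfirmed conntrack and may receive a different
 * source-port mapping, so the ORIGINAL tuples clash while the REPLY tuples
 * differ.  The first entry to reach __nf_conntrack_confirm() wins;
 * for the loser, __nf_ct_resolve_clash() can only adopt the winner when no
 * NAT was done or both entries match exactly.  Otherwise
 * nf_ct_resolve_clash_harder() above inserts the loser in the reply
 * direction only (IPS_NAT_CLASH, 1s timeout), so neither the already-NATed
 * packet nor its reply has to be dropped.
 */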
/**
 * nf_ct_resolve_clash - attempt to handle clash without packet drop
 *
 * @skb: skb that causes the clash
 * @h: tuplehash of the clashing entry already in table
 * @reply_hash: hash slot for reply direction
 *
 * A conntrack entry can be inserted to the connection tracking table
 * if there is no existing entry with an identical tuple.
 *
 * If there is one, @skb (and the associated, unconfirmed conntrack) has
 * to be dropped.  In case @skb is retransmitted, next conntrack lookup
 * will find the already-existing entry.
 *
 * The major problem with such packet drop is the extra delay added by
 * the packet loss -- it will take some time for a retransmit to occur
 * (or the sender to time out when waiting for a reply).
 *
 * This function attempts to handle the situation without packet drop.
 *
 * If @skb has no NAT transformation or if the colliding entries are
 * exactly the same, only the to-be-confirmed conntrack entry is discarded
 * and @skb is associated with the conntrack entry already in the table.
 *
 * Failing that, the new, unconfirmed conntrack is still added to the table
 * provided that the collision only occurs in the ORIGINAL direction.
 * The new entry will be added only in the non-clashing REPLY direction,
 * so packets in the ORIGINAL direction will continue to match the existing
 * entry.  The new entry will also have a fixed timeout so it expires --
 * due to the collision, it will only see reply traffic.
 *
 * Returns NF_DROP if the clash could not be resolved.
 */
static __cold noinline int
nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
		    u32 reply_hash)
{
	/* This is the conntrack entry already in hashes that won race. */
	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
	const struct nf_conntrack_l4proto *l4proto;
	enum ip_conntrack_info ctinfo;
	struct nf_conn *loser_ct;
	struct net *net;
	int ret;

	loser_ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(loser_ct);

	l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct));
	if (!l4proto->allow_clash)
		goto drop;

	ret = __nf_ct_resolve_clash(skb, h);
	if (ret == NF_ACCEPT)
		return ret;

	ret = nf_ct_resolve_clash_harder(skb, reply_hash);
	if (ret == NF_ACCEPT)
		return ret;

drop:
	NF_CT_STAT_INC(net, drop);
	NF_CT_STAT_INC(net, insert_failed);
	return NF_DROP;
}
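/* How confirmation is reached (simplified sketch, not a copy of the real
 * helper): packets pick up an unconfirmed conntrack in nf_conntrack_in(),
 * and the entry is only moved into the hash table by the late confirm hook
 * via the nf_conntrack_confirm() inline in
 * include/net/netfilter/nf_conntrack_core.h, roughly:
 *
 *	ct = nf_ct_get(skb, &ctinfo);
 *	if (ct && !nf_ct_is_confirmed(ct))
 *		ret = __nf_conntrack_confirm(skb);
 *
 * so the function below normally runs once per connection, not once per
 * packet.
 */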
/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int chainlen = 0, sequence, max_chainlen;
	const struct nf_conntrack_zone *zone;
	unsigned int hash, reply_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;
	int ret = NF_DROP;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	 * ICMP/TCP RST packets in the other direction.  The actual packet
	 * which created the connection will be IP_CT_NEW or, for an
	 * expected connection, IP_CT_RELATED.
	 */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	zone = nf_ct_zone(ct);
	local_bh_disable();

	do {
		sequence = read_seqcount_begin(&nf_conntrack_generation);
		/* reuse the hash saved before */
		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
		hash = scale_hash(hash);
		reply_hash = hash_conntrack(net,
					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
					    nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY));
	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));

	/* We're not in hash table, and we refuse to set up related
	 * connections for unconfirmed conns.  But packet copies and
	 * REJECT will give spurious warnings here.
	 */

	/* Another skb with the same unconfirmed conntrack may
	 * win the race.  This may happen for bridge (br_flood) or
	 * broadcast/multicast packets, which do skb_clone() with an
	 * unconfirmed conntrack.
	 */
	if (unlikely(nf_ct_is_confirmed(ct))) {
		WARN_ON_ONCE(1);
		nf_conntrack_double_unlock(hash, reply_hash);
		local_bh_enable();
		return NF_DROP;
	}

	if (!nf_ct_ext_valid_pre(ct->ext)) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	pr_debug("Confirming conntrack %p\n", ct);
	/* We have to check the DYING flag after unlink to prevent
	 * a race against nf_ct_get_next_corpse() possibly called from
	 * user context, else we insert an already 'dead' hash, blocking
	 * further use of that particular connection -JM.
	 */
	ct->status |= IPS_CONFIRMED;

	if (unlikely(nf_ct_is_dying(ct))) {
		NF_CT_STAT_INC(net, insert_failed);
		goto dying;
	}

	max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN);
	/* See if there's one in the list already, including reverse:
	 * NAT could have grabbed it without realizing, since we're
	 * not in the hash.  If there is, we lost the race.
	 */
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen)
			goto chaintoolong;
	}

	chainlen = 0;
	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				    zone, net))
			goto out;
		if (chainlen++ > max_chainlen) {
chaintoolong:
			NF_CT_STAT_INC(net, chaintoolong);
			NF_CT_STAT_INC(net, insert_failed);
			ret = NF_DROP;
			goto dying;
		}
	}

	/* Timer relative to confirmation time, not original
	 * setting time, otherwise we'd get timer wrap in
	 * weird delay cases.
	 */
	ct->timeout += nfct_time_stamp;

	__nf_conntrack_insert_prepare(ct);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, reply_hash);
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();

	/* ext area is still valid (rcu read lock is held, but will go out
	 * of scope soon); if it went stale we need to remove this conntrack
	 * again.
	 */
	if (!nf_ct_ext_valid_post(ct->ext)) {
		nf_ct_kill(ct);
		NF_CT_STAT_INC(net, drop);
		return NF_DROP;
	}

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	ret = nf_ct_resolve_clash(skb, h, reply_hash);
dying:
	nf_conntrack_double_unlock(hash, reply_hash);
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
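/* Typical caller of nf_conntrack_tuple_taken() below is NAT's port/address
 * search: a candidate tuple is only usable if its reply direction is not
 * already claimed by another conntrack.  A sketch of that check (based on
 * nf_nat_used_tuple() in nf_nat_core.c; see that file for the authoritative
 * version -- "candidate_tuple" and "ct" are placeholders):
 *
 *	struct nf_conntrack_tuple reply;
 *
 *	nf_ct_invert_tuple(&reply, candidate_tuple);
 *	if (nf_conntrack_tuple_taken(&reply, ct))
 *		;	// candidate clashes, try the next port
 */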
/* Returns true if a connection corresponds to the tuple (required
 * for NAT).
 */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_head *ct_hash;
	unsigned int hash, hsize;
	struct hlist_nulls_node *n;
	struct nf_conn *ct;

	zone = nf_ct_zone(ignored_conntrack);

	rcu_read_lock();
 begin:
	nf_conntrack_get_ht(&ct_hash, &hsize);
	hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize);

	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);

		if (ct == ignored_conntrack)
			continue;

		if (nf_ct_is_expired(ct)) {
			nf_ct_gc_expired(ct);
			continue;
		}

		if (nf_ct_key_equal(h, tuple, zone, net)) {
			/* Tuple is taken already, so caller will need to find
			 * a new source port to use.
			 *
			 * Only exception:
			 * If the *original tuples* are identical, then both
			 * conntracks refer to the same flow.
			 * This is a rare situation, it can occur e.g. when
			 * more than one UDP packet is sent from same socket
			 * in different threads.
			 *
			 * Let nf_ct_resolve_clash() deal with this later.
			 */
			if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
					      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
			    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
				continue;

			NF_CT_STAT_INC_ATOMIC(net, found);
			rcu_read_unlock();
			return 1;
		}
	}

	if (get_nulls_value(n) != hash) {
		NF_CT_STAT_INC_ATOMIC(net, search_restart);
		goto begin;
	}

	rcu_read_unlock();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
 * connection.  Too bad: we're in trouble anyway.
*/ 1367 static unsigned int early_drop_list(struct net *net, 1368 struct hlist_nulls_head *head) 1369 { 1370 struct nf_conntrack_tuple_hash *h; 1371 struct hlist_nulls_node *n; 1372 unsigned int drops = 0; 1373 struct nf_conn *tmp; 1374 1375 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1376 tmp = nf_ct_tuplehash_to_ctrack(h); 1377 1378 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 1379 continue; 1380 1381 if (nf_ct_is_expired(tmp)) { 1382 nf_ct_gc_expired(tmp); 1383 continue; 1384 } 1385 1386 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1387 !net_eq(nf_ct_net(tmp), net) || 1388 nf_ct_is_dying(tmp)) 1389 continue; 1390 1391 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1392 continue; 1393 1394 /* load ->ct_net and ->status after refcount increase */ 1395 smp_acquire__after_ctrl_dep(); 1396 1397 /* kill only if still in same netns -- might have moved due to 1398 * SLAB_TYPESAFE_BY_RCU rules. 1399 * 1400 * We steal the timer reference. If that fails timer has 1401 * already fired or someone else deleted it. Just drop ref 1402 * and move to next entry. 1403 */ 1404 if (net_eq(nf_ct_net(tmp), net) && 1405 nf_ct_is_confirmed(tmp) && 1406 nf_ct_delete(tmp, 0, 0)) 1407 drops++; 1408 1409 nf_ct_put(tmp); 1410 } 1411 1412 return drops; 1413 } 1414 1415 static noinline int early_drop(struct net *net, unsigned int hash) 1416 { 1417 unsigned int i, bucket; 1418 1419 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1420 struct hlist_nulls_head *ct_hash; 1421 unsigned int hsize, drops; 1422 1423 rcu_read_lock(); 1424 nf_conntrack_get_ht(&ct_hash, &hsize); 1425 if (!i) 1426 bucket = reciprocal_scale(hash, hsize); 1427 else 1428 bucket = (bucket + 1) % hsize; 1429 1430 drops = early_drop_list(net, &ct_hash[bucket]); 1431 rcu_read_unlock(); 1432 1433 if (drops) { 1434 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1435 return true; 1436 } 1437 } 1438 1439 return false; 1440 } 1441 1442 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1443 { 1444 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1445 } 1446 1447 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1448 { 1449 const struct nf_conntrack_l4proto *l4proto; 1450 1451 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1452 return true; 1453 1454 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1455 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1456 return true; 1457 1458 return false; 1459 } 1460 1461 static void gc_worker(struct work_struct *work) 1462 { 1463 unsigned int i, hashsz, nf_conntrack_max95 = 0; 1464 u32 end_time, start_time = nfct_time_stamp; 1465 struct conntrack_gc_work *gc_work; 1466 unsigned int expired_count = 0; 1467 unsigned long next_run; 1468 s32 delta_time; 1469 1470 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1471 1472 i = gc_work->next_bucket; 1473 if (gc_work->early_drop) 1474 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1475 1476 if (i == 0) { 1477 gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; 1478 gc_work->start_time = start_time; 1479 } 1480 1481 next_run = gc_work->avg_timeout; 1482 1483 end_time = start_time + GC_SCAN_MAX_DURATION; 1484 1485 do { 1486 struct nf_conntrack_tuple_hash *h; 1487 struct hlist_nulls_head *ct_hash; 1488 struct hlist_nulls_node *n; 1489 struct nf_conn *tmp; 1490 1491 rcu_read_lock(); 1492 1493 nf_conntrack_get_ht(&ct_hash, &hashsz); 1494 if (i >= hashsz) { 1495 rcu_read_unlock(); 1496 break; 1497 } 1498 1499 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1500 struct nf_conntrack_net *cnet; 1501 unsigned long 
expires; 1502 struct net *net; 1503 1504 tmp = nf_ct_tuplehash_to_ctrack(h); 1505 1506 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1507 nf_ct_offload_timeout(tmp); 1508 continue; 1509 } 1510 1511 if (expired_count > GC_SCAN_EXPIRED_MAX) { 1512 rcu_read_unlock(); 1513 1514 gc_work->next_bucket = i; 1515 gc_work->avg_timeout = next_run; 1516 1517 delta_time = nfct_time_stamp - gc_work->start_time; 1518 1519 /* re-sched immediately if total cycle time is exceeded */ 1520 next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX; 1521 goto early_exit; 1522 } 1523 1524 if (nf_ct_is_expired(tmp)) { 1525 nf_ct_gc_expired(tmp); 1526 expired_count++; 1527 continue; 1528 } 1529 1530 expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); 1531 next_run += expires; 1532 next_run /= 2u; 1533 1534 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1535 continue; 1536 1537 net = nf_ct_net(tmp); 1538 cnet = nf_ct_pernet(net); 1539 if (atomic_read(&cnet->count) < nf_conntrack_max95) 1540 continue; 1541 1542 /* need to take reference to avoid possible races */ 1543 if (!refcount_inc_not_zero(&tmp->ct_general.use)) 1544 continue; 1545 1546 /* load ->status after refcount increase */ 1547 smp_acquire__after_ctrl_dep(); 1548 1549 if (gc_worker_skip_ct(tmp)) { 1550 nf_ct_put(tmp); 1551 continue; 1552 } 1553 1554 if (gc_worker_can_early_drop(tmp)) { 1555 nf_ct_kill(tmp); 1556 expired_count++; 1557 } 1558 1559 nf_ct_put(tmp); 1560 } 1561 1562 /* could check get_nulls_value() here and restart if ct 1563 * was moved to another chain. But given gc is best-effort 1564 * we will just continue with next hash slot. 1565 */ 1566 rcu_read_unlock(); 1567 cond_resched(); 1568 i++; 1569 1570 delta_time = nfct_time_stamp - end_time; 1571 if (delta_time > 0 && i < hashsz) { 1572 gc_work->avg_timeout = next_run; 1573 gc_work->next_bucket = i; 1574 next_run = 0; 1575 goto early_exit; 1576 } 1577 } while (i < hashsz); 1578 1579 gc_work->next_bucket = 0; 1580 1581 next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX); 1582 1583 delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1); 1584 if (next_run > (unsigned long)delta_time) 1585 next_run -= delta_time; 1586 else 1587 next_run = 1; 1588 1589 early_exit: 1590 if (gc_work->exiting) 1591 return; 1592 1593 if (next_run) 1594 gc_work->early_drop = false; 1595 1596 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1597 } 1598 1599 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1600 { 1601 INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); 1602 gc_work->exiting = false; 1603 } 1604 1605 static struct nf_conn * 1606 __nf_conntrack_alloc(struct net *net, 1607 const struct nf_conntrack_zone *zone, 1608 const struct nf_conntrack_tuple *orig, 1609 const struct nf_conntrack_tuple *repl, 1610 gfp_t gfp, u32 hash) 1611 { 1612 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 1613 unsigned int ct_count; 1614 struct nf_conn *ct; 1615 1616 /* We don't want any race condition at early drop stage */ 1617 ct_count = atomic_inc_return(&cnet->count); 1618 1619 if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { 1620 if (!early_drop(net, hash)) { 1621 if (!conntrack_gc_work.early_drop) 1622 conntrack_gc_work.early_drop = true; 1623 atomic_dec(&cnet->count); 1624 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1625 return ERR_PTR(-ENOMEM); 1626 } 1627 } 1628 1629 /* 1630 * Do not use kmem_cache_zalloc(), as this cache uses 1631 * SLAB_TYPESAFE_BY_RCU. 
1632 */ 1633 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1634 if (ct == NULL) 1635 goto out; 1636 1637 spin_lock_init(&ct->lock); 1638 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1639 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1640 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1641 /* save hash for reusing when confirming */ 1642 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1643 ct->status = 0; 1644 WRITE_ONCE(ct->timeout, 0); 1645 write_pnet(&ct->ct_net, net); 1646 memset_after(ct, 0, __nfct_init_offset); 1647 1648 nf_ct_zone_add(ct, zone); 1649 1650 /* Because we use RCU lookups, we set ct_general.use to zero before 1651 * this is inserted in any list. 1652 */ 1653 refcount_set(&ct->ct_general.use, 0); 1654 return ct; 1655 out: 1656 atomic_dec(&cnet->count); 1657 return ERR_PTR(-ENOMEM); 1658 } 1659 1660 struct nf_conn *nf_conntrack_alloc(struct net *net, 1661 const struct nf_conntrack_zone *zone, 1662 const struct nf_conntrack_tuple *orig, 1663 const struct nf_conntrack_tuple *repl, 1664 gfp_t gfp) 1665 { 1666 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1667 } 1668 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1669 1670 void nf_conntrack_free(struct nf_conn *ct) 1671 { 1672 struct net *net = nf_ct_net(ct); 1673 struct nf_conntrack_net *cnet; 1674 1675 /* A freed object has refcnt == 0, that's 1676 * the golden rule for SLAB_TYPESAFE_BY_RCU 1677 */ 1678 WARN_ON(refcount_read(&ct->ct_general.use) != 0); 1679 1680 if (ct->status & IPS_SRC_NAT_DONE) { 1681 const struct nf_nat_hook *nat_hook; 1682 1683 rcu_read_lock(); 1684 nat_hook = rcu_dereference(nf_nat_hook); 1685 if (nat_hook) 1686 nat_hook->remove_nat_bysrc(ct); 1687 rcu_read_unlock(); 1688 } 1689 1690 kfree(ct->ext); 1691 kmem_cache_free(nf_conntrack_cachep, ct); 1692 cnet = nf_ct_pernet(net); 1693 1694 smp_mb__before_atomic(); 1695 atomic_dec(&cnet->count); 1696 } 1697 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1698 1699 1700 /* Allocate a new conntrack: we return -ENOMEM if classification 1701 failed due to stress. Otherwise it really is unclassifiable. */ 1702 static noinline struct nf_conntrack_tuple_hash * 1703 init_conntrack(struct net *net, struct nf_conn *tmpl, 1704 const struct nf_conntrack_tuple *tuple, 1705 struct sk_buff *skb, 1706 unsigned int dataoff, u32 hash) 1707 { 1708 struct nf_conn *ct; 1709 struct nf_conn_help *help; 1710 struct nf_conntrack_tuple repl_tuple; 1711 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1712 struct nf_conntrack_ecache *ecache; 1713 #endif 1714 struct nf_conntrack_expect *exp = NULL; 1715 const struct nf_conntrack_zone *zone; 1716 struct nf_conn_timeout *timeout_ext; 1717 struct nf_conntrack_zone tmp; 1718 struct nf_conntrack_net *cnet; 1719 1720 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { 1721 pr_debug("Can't invert tuple.\n"); 1722 return NULL; 1723 } 1724 1725 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1726 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1727 hash); 1728 if (IS_ERR(ct)) 1729 return (struct nf_conntrack_tuple_hash *)ct; 1730 1731 if (!nf_ct_add_synproxy(ct, tmpl)) { 1732 nf_conntrack_free(ct); 1733 return ERR_PTR(-ENOMEM); 1734 } 1735 1736 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1737 1738 if (timeout_ext) 1739 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1740 GFP_ATOMIC); 1741 1742 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1743 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1744 nf_ct_labels_ext_add(ct); 1745 1746 #ifdef CONFIG_NF_CONNTRACK_EVENTS 1747 ecache = tmpl ? 
nf_ct_ecache_find(tmpl) : NULL;

	if ((ecache || net->ct.sysctl_events) &&
	    !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
				  ecache ? ecache->expmask : 0,
				  GFP_ATOMIC)) {
		nf_conntrack_free(ct);
		return ERR_PTR(-ENOMEM);
	}
#endif

	cnet = nf_ct_pernet(net);
	if (cnet->expect_count) {
		spin_lock_bh(&nf_conntrack_expect_lock);
		exp = nf_ct_find_expectation(net, zone, tuple);
		if (exp) {
			pr_debug("expectation arrives ct=%p exp=%p\n",
				 ct, exp);
			/* Welcome, Mr. Bond.  We've been expecting you... */
			__set_bit(IPS_EXPECTED_BIT, &ct->status);
			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
			ct->master = exp->master;
			if (exp->helper) {
				help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
				if (help)
					rcu_assign_pointer(help->helper, exp->helper);
			}

#ifdef CONFIG_NF_CONNTRACK_MARK
			ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
			ct->secmark = exp->master->secmark;
#endif
			NF_CT_STAT_INC(net, expect_new);
		}
		spin_unlock_bh(&nf_conntrack_expect_lock);
	}
	if (!exp)
		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);

	/* Another CPU might have obtained a pointer to this object before it
	 * was released.  Because refcount is 0, refcount_inc_not_zero() will
	 * fail.
	 *
	 * After refcount_set(1) it will succeed; ensure that zeroing of
	 * ct->status and the correct ct->net pointer are visible; else another
	 * core might observe the CONFIRMED bit, which means the entry is valid
	 * and in the hash table, but it's not (anymore).
	 */
	smp_wmb();

	/* Now it is going to be associated with an sk_buff, set refcount to 1. */
	refcount_set(&ct->ct_general.use, 1);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
static int
resolve_normal_ct(struct nf_conn *tmpl,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int8_t protonum,
		  const struct nf_hook_state *state)
{
	const struct nf_conntrack_zone *zone;
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_zone tmp;
	u32 hash, zone_id, rid;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, state->pf, protonum, state->net,
			     &tuple)) {
		pr_debug("Can't get tuple\n");
		return 0;
	}

	/* look for tuple match */
	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);

	zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
	hash = hash_conntrack_raw(&tuple, zone_id, state->net);
	h = __nf_conntrack_find_get(state->net, zone, &tuple, hash);

	if (!h) {
		rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
		if (zone_id != rid) {
			u32 tmp = hash_conntrack_raw(&tuple, rid, state->net);

			h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp);
		}
	}

	if (!h) {
		h = init_conntrack(state->net, tmpl, &tuple,
				   skb, dataoff, hash);
		if (!h)
			return 0;
		if (IS_ERR(h))
			return PTR_ERR(h);
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference.
*/ 1860 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1861 ctinfo = IP_CT_ESTABLISHED_REPLY; 1862 } else { 1863 /* Once we've had two way comms, always ESTABLISHED. */ 1864 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1865 pr_debug("normal packet for %p\n", ct); 1866 ctinfo = IP_CT_ESTABLISHED; 1867 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { 1868 pr_debug("related packet for %p\n", ct); 1869 ctinfo = IP_CT_RELATED; 1870 } else { 1871 pr_debug("new packet for %p\n", ct); 1872 ctinfo = IP_CT_NEW; 1873 } 1874 } 1875 nf_ct_set(skb, ct, ctinfo); 1876 return 0; 1877 } 1878 1879 /* 1880 * icmp packets need special treatment to handle error messages that are 1881 * related to a connection. 1882 * 1883 * Callers need to check if skb has a conntrack assigned when this 1884 * helper returns; in such case skb belongs to an already known connection. 1885 */ 1886 static unsigned int __cold 1887 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1888 struct sk_buff *skb, 1889 unsigned int dataoff, 1890 u8 protonum, 1891 const struct nf_hook_state *state) 1892 { 1893 int ret; 1894 1895 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1896 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1897 #if IS_ENABLED(CONFIG_IPV6) 1898 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1899 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1900 #endif 1901 else 1902 return NF_ACCEPT; 1903 1904 if (ret <= 0) 1905 NF_CT_STAT_INC_ATOMIC(state->net, error); 1906 1907 return ret; 1908 } 1909 1910 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1911 enum ip_conntrack_info ctinfo) 1912 { 1913 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1914 1915 if (!timeout) 1916 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1917 1918 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1919 return NF_ACCEPT; 1920 } 1921 1922 /* Returns verdict for packet, or -1 for invalid. 
*/ 1923 static int nf_conntrack_handle_packet(struct nf_conn *ct, 1924 struct sk_buff *skb, 1925 unsigned int dataoff, 1926 enum ip_conntrack_info ctinfo, 1927 const struct nf_hook_state *state) 1928 { 1929 switch (nf_ct_protonum(ct)) { 1930 case IPPROTO_TCP: 1931 return nf_conntrack_tcp_packet(ct, skb, dataoff, 1932 ctinfo, state); 1933 case IPPROTO_UDP: 1934 return nf_conntrack_udp_packet(ct, skb, dataoff, 1935 ctinfo, state); 1936 case IPPROTO_ICMP: 1937 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); 1938 #if IS_ENABLED(CONFIG_IPV6) 1939 case IPPROTO_ICMPV6: 1940 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); 1941 #endif 1942 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 1943 case IPPROTO_UDPLITE: 1944 return nf_conntrack_udplite_packet(ct, skb, dataoff, 1945 ctinfo, state); 1946 #endif 1947 #ifdef CONFIG_NF_CT_PROTO_SCTP 1948 case IPPROTO_SCTP: 1949 return nf_conntrack_sctp_packet(ct, skb, dataoff, 1950 ctinfo, state); 1951 #endif 1952 #ifdef CONFIG_NF_CT_PROTO_DCCP 1953 case IPPROTO_DCCP: 1954 return nf_conntrack_dccp_packet(ct, skb, dataoff, 1955 ctinfo, state); 1956 #endif 1957 #ifdef CONFIG_NF_CT_PROTO_GRE 1958 case IPPROTO_GRE: 1959 return nf_conntrack_gre_packet(ct, skb, dataoff, 1960 ctinfo, state); 1961 #endif 1962 } 1963 1964 return generic_packet(ct, skb, ctinfo); 1965 } 1966 1967 unsigned int 1968 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 1969 { 1970 enum ip_conntrack_info ctinfo; 1971 struct nf_conn *ct, *tmpl; 1972 u_int8_t protonum; 1973 int dataoff, ret; 1974 1975 tmpl = nf_ct_get(skb, &ctinfo); 1976 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1977 /* Previously seen (loopback or untracked)? Ignore. */ 1978 if ((tmpl && !nf_ct_is_template(tmpl)) || 1979 ctinfo == IP_CT_UNTRACKED) 1980 return NF_ACCEPT; 1981 skb->_nfct = 0; 1982 } 1983 1984 /* rcu_read_lock()ed by nf_hook_thresh */ 1985 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1986 if (dataoff <= 0) { 1987 pr_debug("not prepared to track yet or error occurred\n"); 1988 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1989 ret = NF_ACCEPT; 1990 goto out; 1991 } 1992 1993 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1994 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1995 protonum, state); 1996 if (ret <= 0) { 1997 ret = -ret; 1998 goto out; 1999 } 2000 /* ICMP[v6] protocol trackers may assign one conntrack. */ 2001 if (skb->_nfct) 2002 goto out; 2003 } 2004 repeat: 2005 ret = resolve_normal_ct(tmpl, skb, dataoff, 2006 protonum, state); 2007 if (ret < 0) { 2008 /* Too stressed to deal. */ 2009 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2010 ret = NF_DROP; 2011 goto out; 2012 } 2013 2014 ct = nf_ct_get(skb, &ctinfo); 2015 if (!ct) { 2016 /* Not valid part of a connection */ 2017 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2018 ret = NF_ACCEPT; 2019 goto out; 2020 } 2021 2022 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 2023 if (ret <= 0) { 2024 /* Invalid: inverse of the return code tells 2025 * the netfilter core what to do */ 2026 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 2027 nf_ct_put(ct); 2028 skb->_nfct = 0; 2029 /* Special case: TCP tracker reports an attempt to reopen a 2030 * closed/aborted connection. We have to go back and create a 2031 * fresh conntrack. 
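 * The reference to the old conntrack and skb->_nfct have already been
 * dropped above, so 'repeat' re-runs resolve_normal_ct() and sets up
 * a fresh entry for the reopened connection.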
2032 */ 2033 if (ret == -NF_REPEAT) 2034 goto repeat; 2035 2036 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 2037 if (ret == -NF_DROP) 2038 NF_CT_STAT_INC_ATOMIC(state->net, drop); 2039 2040 ret = -ret; 2041 goto out; 2042 } 2043 2044 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 2045 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 2046 nf_conntrack_event_cache(IPCT_REPLY, ct); 2047 out: 2048 if (tmpl) 2049 nf_ct_put(tmpl); 2050 2051 return ret; 2052 } 2053 EXPORT_SYMBOL_GPL(nf_conntrack_in); 2054 2055 /* Alter reply tuple (maybe alter helper). This is for NAT, and is 2056 implicitly racy: see __nf_conntrack_confirm */ 2057 void nf_conntrack_alter_reply(struct nf_conn *ct, 2058 const struct nf_conntrack_tuple *newreply) 2059 { 2060 struct nf_conn_help *help = nfct_help(ct); 2061 2062 /* Should be unconfirmed, so not in hash table yet */ 2063 WARN_ON(nf_ct_is_confirmed(ct)); 2064 2065 pr_debug("Altering reply tuple of %p to ", ct); 2066 nf_ct_dump_tuple(newreply); 2067 2068 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 2069 if (ct->master || (help && !hlist_empty(&help->expectations))) 2070 return; 2071 2072 rcu_read_lock(); 2073 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); 2074 rcu_read_unlock(); 2075 } 2076 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 2077 2078 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 2079 void __nf_ct_refresh_acct(struct nf_conn *ct, 2080 enum ip_conntrack_info ctinfo, 2081 const struct sk_buff *skb, 2082 u32 extra_jiffies, 2083 bool do_acct) 2084 { 2085 /* Only update if this is not a fixed timeout */ 2086 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2087 goto acct; 2088 2089 /* If not in hash table, timer will not be active yet */ 2090 if (nf_ct_is_confirmed(ct)) 2091 extra_jiffies += nfct_time_stamp; 2092 2093 if (READ_ONCE(ct->timeout) != extra_jiffies) 2094 WRITE_ONCE(ct->timeout, extra_jiffies); 2095 acct: 2096 if (do_acct) 2097 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2098 } 2099 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 2100 2101 bool nf_ct_kill_acct(struct nf_conn *ct, 2102 enum ip_conntrack_info ctinfo, 2103 const struct sk_buff *skb) 2104 { 2105 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 2106 2107 return nf_ct_delete(ct, 0, 0); 2108 } 2109 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 2110 2111 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 2112 2113 #include <linux/netfilter/nfnetlink.h> 2114 #include <linux/netfilter/nfnetlink_conntrack.h> 2115 #include <linux/mutex.h> 2116 2117 /* Generic function for tcp/udp/sctp/dccp and alike. 
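 *
 * An illustrative sketch, not part of this file: a port-based tracker
 * typically wires these helpers into its struct nf_conntrack_l4proto,
 * much like the TCP/UDP trackers do.  IPPROTO_FOO and
 * nf_conntrack_l4proto_foo below are placeholders, and the exact
 * field set depends on the kernel version:
 *
 *	const struct nf_conntrack_l4proto nf_conntrack_l4proto_foo = {
 *		.l4proto		= IPPROTO_FOO,
 *	#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 *		.tuple_to_nlattr	= nf_ct_port_tuple_to_nlattr,
 *		.nlattr_tuple_size	= nf_ct_port_nlattr_tuple_size,
 *		.nlattr_to_tuple	= nf_ct_port_nlattr_to_tuple,
 *		.nla_policy		= nf_ct_port_nla_policy,
 *	#endif
 *	};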
*/ 2118 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 2119 const struct nf_conntrack_tuple *tuple) 2120 { 2121 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 2122 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 2123 goto nla_put_failure; 2124 return 0; 2125 2126 nla_put_failure: 2127 return -1; 2128 } 2129 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 2130 2131 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 2132 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 2133 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 2134 }; 2135 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 2136 2137 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 2138 struct nf_conntrack_tuple *t, 2139 u_int32_t flags) 2140 { 2141 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 2142 if (!tb[CTA_PROTO_SRC_PORT]) 2143 return -EINVAL; 2144 2145 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 2146 } 2147 2148 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 2149 if (!tb[CTA_PROTO_DST_PORT]) 2150 return -EINVAL; 2151 2152 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 2153 } 2154 2155 return 0; 2156 } 2157 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 2158 2159 unsigned int nf_ct_port_nlattr_tuple_size(void) 2160 { 2161 static unsigned int size __read_mostly; 2162 2163 if (!size) 2164 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2165 2166 return size; 2167 } 2168 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2169 #endif 2170 2171 /* Used by ipt_REJECT and ip6t_REJECT. */ 2172 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2173 { 2174 struct nf_conn *ct; 2175 enum ip_conntrack_info ctinfo; 2176 2177 /* This ICMP is in reverse direction to the packet which caused it */ 2178 ct = nf_ct_get(skb, &ctinfo); 2179 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2180 ctinfo = IP_CT_RELATED_REPLY; 2181 else 2182 ctinfo = IP_CT_RELATED; 2183 2184 /* Attach to new skbuff, and increment count */ 2185 nf_ct_set(nskb, ct, ctinfo); 2186 nf_conntrack_get(skb_nfct(nskb)); 2187 } 2188 2189 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2190 struct nf_conn *ct, 2191 enum ip_conntrack_info ctinfo) 2192 { 2193 const struct nf_nat_hook *nat_hook; 2194 struct nf_conntrack_tuple_hash *h; 2195 struct nf_conntrack_tuple tuple; 2196 unsigned int status; 2197 int dataoff; 2198 u16 l3num; 2199 u8 l4num; 2200 2201 l3num = nf_ct_l3num(ct); 2202 2203 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2204 if (dataoff <= 0) 2205 return -1; 2206 2207 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2208 l4num, net, &tuple)) 2209 return -1; 2210 2211 if (ct->status & IPS_SRC_NAT) { 2212 memcpy(tuple.src.u3.all, 2213 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2214 sizeof(tuple.src.u3.all)); 2215 tuple.src.u.all = 2216 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2217 } 2218 2219 if (ct->status & IPS_DST_NAT) { 2220 memcpy(tuple.dst.u3.all, 2221 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2222 sizeof(tuple.dst.u3.all)); 2223 tuple.dst.u.all = 2224 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2225 } 2226 2227 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2228 if (!h) 2229 return 0; 2230 2231 /* Store status bits of the conntrack that is clashing to re-do NAT 2232 * mangling according to what it has been done already to this packet. 
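 * The unconfirmed conntrack currently attached to the skb is put
 * below and replaced by the confirmed entry found in the table; if
 * the packet was already mangled (IPS_SRC_NAT/IPS_DST_NAT in the
 * saved status), the same manipulations are re-applied via
 * nf_nat_hook so the packet matches the entry it now points to.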
2233 */ 2234 status = ct->status; 2235 2236 nf_ct_put(ct); 2237 ct = nf_ct_tuplehash_to_ctrack(h); 2238 nf_ct_set(skb, ct, ctinfo); 2239 2240 nat_hook = rcu_dereference(nf_nat_hook); 2241 if (!nat_hook) 2242 return 0; 2243 2244 if (status & IPS_SRC_NAT && 2245 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2246 IP_CT_DIR_ORIGINAL) == NF_DROP) 2247 return -1; 2248 2249 if (status & IPS_DST_NAT && 2250 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2251 IP_CT_DIR_ORIGINAL) == NF_DROP) 2252 return -1; 2253 2254 return 0; 2255 } 2256 2257 /* This packet is coming from userspace via nf_queue, complete the packet 2258 * processing after the helper invocation in nf_confirm(). 2259 */ 2260 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2261 enum ip_conntrack_info ctinfo) 2262 { 2263 const struct nf_conntrack_helper *helper; 2264 const struct nf_conn_help *help; 2265 int protoff; 2266 2267 help = nfct_help(ct); 2268 if (!help) 2269 return 0; 2270 2271 helper = rcu_dereference(help->helper); 2272 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2273 return 0; 2274 2275 switch (nf_ct_l3num(ct)) { 2276 case NFPROTO_IPV4: 2277 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2278 break; 2279 #if IS_ENABLED(CONFIG_IPV6) 2280 case NFPROTO_IPV6: { 2281 __be16 frag_off; 2282 u8 pnum; 2283 2284 pnum = ipv6_hdr(skb)->nexthdr; 2285 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2286 &frag_off); 2287 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2288 return 0; 2289 break; 2290 } 2291 #endif 2292 default: 2293 return 0; 2294 } 2295 2296 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2297 !nf_is_loopback_packet(skb)) { 2298 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2299 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2300 return -1; 2301 } 2302 } 2303 2304 /* We've seen it coming out the other side: confirm it */ 2305 return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; 2306 } 2307 2308 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2309 { 2310 enum ip_conntrack_info ctinfo; 2311 struct nf_conn *ct; 2312 int err; 2313 2314 ct = nf_ct_get(skb, &ctinfo); 2315 if (!ct) 2316 return 0; 2317 2318 if (!nf_ct_is_confirmed(ct)) { 2319 err = __nf_conntrack_update(net, skb, ct, ctinfo); 2320 if (err < 0) 2321 return err; 2322 2323 ct = nf_ct_get(skb, &ctinfo); 2324 } 2325 2326 return nf_confirm_cthelper(skb, ct, ctinfo); 2327 } 2328 2329 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2330 const struct sk_buff *skb) 2331 { 2332 const struct nf_conntrack_tuple *src_tuple; 2333 const struct nf_conntrack_tuple_hash *hash; 2334 struct nf_conntrack_tuple srctuple; 2335 enum ip_conntrack_info ctinfo; 2336 struct nf_conn *ct; 2337 2338 ct = nf_ct_get(skb, &ctinfo); 2339 if (ct) { 2340 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2341 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2342 return true; 2343 } 2344 2345 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2346 NFPROTO_IPV4, dev_net(skb->dev), 2347 &srctuple)) 2348 return false; 2349 2350 hash = nf_conntrack_find_get(dev_net(skb->dev), 2351 &nf_ct_zone_dflt, 2352 &srctuple); 2353 if (!hash) 2354 return false; 2355 2356 ct = nf_ct_tuplehash_to_ctrack(hash); 2357 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2358 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2359 nf_ct_put(ct); 2360 2361 return true; 2362 } 2363 2364 /* Bring out ya dead! 
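 *
 * get_next_corpse() walks the hash buckets under the corresponding
 * nf_conntrack_locks[] lock, visits only reply-direction tuple hashes
 * so each conntrack is considered once, and returns the first entry
 * for which @iter returns nonzero with its refcount bumped; the
 * caller (nf_ct_iterate_cleanup() below) deletes it and drops the
 * reference with nf_ct_put().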
*/ 2365 static struct nf_conn * 2366 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 2367 const struct nf_ct_iter_data *iter_data, unsigned int *bucket) 2368 { 2369 struct nf_conntrack_tuple_hash *h; 2370 struct nf_conn *ct; 2371 struct hlist_nulls_node *n; 2372 spinlock_t *lockp; 2373 2374 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 2375 struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; 2376 2377 if (hlist_nulls_empty(hslot)) 2378 continue; 2379 2380 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2381 local_bh_disable(); 2382 nf_conntrack_lock(lockp); 2383 hlist_nulls_for_each_entry(h, n, hslot, hnnode) { 2384 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2385 continue; 2386 /* All nf_conn objects are added to hash table twice, one 2387 * for original direction tuple, once for the reply tuple. 2388 * 2389 * Exception: In the IPS_NAT_CLASH case, only the reply 2390 * tuple is added (the original tuple already existed for 2391 * a different object). 2392 * 2393 * We only need to call the iterator once for each 2394 * conntrack, so we just use the 'reply' direction 2395 * tuple while iterating. 2396 */ 2397 ct = nf_ct_tuplehash_to_ctrack(h); 2398 2399 if (iter_data->net && 2400 !net_eq(iter_data->net, nf_ct_net(ct))) 2401 continue; 2402 2403 if (iter(ct, iter_data->data)) 2404 goto found; 2405 } 2406 spin_unlock(lockp); 2407 local_bh_enable(); 2408 cond_resched(); 2409 } 2410 2411 return NULL; 2412 found: 2413 refcount_inc(&ct->ct_general.use); 2414 spin_unlock(lockp); 2415 local_bh_enable(); 2416 return ct; 2417 } 2418 2419 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 2420 const struct nf_ct_iter_data *iter_data) 2421 { 2422 unsigned int bucket = 0; 2423 struct nf_conn *ct; 2424 2425 might_sleep(); 2426 2427 mutex_lock(&nf_conntrack_mutex); 2428 while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) { 2429 /* Time to push up daises... */ 2430 2431 nf_ct_delete(ct, iter_data->portid, iter_data->report); 2432 nf_ct_put(ct); 2433 cond_resched(); 2434 } 2435 mutex_unlock(&nf_conntrack_mutex); 2436 } 2437 2438 void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data), 2439 const struct nf_ct_iter_data *iter_data) 2440 { 2441 struct net *net = iter_data->net; 2442 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2443 2444 might_sleep(); 2445 2446 if (atomic_read(&cnet->count) == 0) 2447 return; 2448 2449 nf_ct_iterate_cleanup(iter, iter_data); 2450 } 2451 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); 2452 2453 /** 2454 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table 2455 * @iter: callback to invoke for each conntrack 2456 * @data: data to pass to @iter 2457 * 2458 * Like nf_ct_iterate_cleanup, but first marks conntracks on the 2459 * unconfirmed list as dying (so they will not be inserted into 2460 * main table). 2461 * 2462 * Can only be called in module exit path. 
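 *
 * Illustrative sketch, not part of this file (the callback and exit
 * function names below are made up): a module exit path passes a
 * filter that returns nonzero for every entry it wants removed, e.g.
 *
 *	static int untrack_all(struct nf_conn *ct, void *data)
 *	{
 *		return 1;	[match everything, like kill_all() below]
 *	}
 *
 *	static void __exit my_module_exit(void)
 *	{
 *		nf_ct_iterate_destroy(untrack_all, NULL);
 *	}
 *
 * A real filter would normally inspect @ct (zone, helper, status
 * bits) and return 0 for entries that must survive.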
2463 */ 2464 void 2465 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2466 { 2467 struct nf_ct_iter_data iter_data = {}; 2468 struct net *net; 2469 2470 down_read(&net_rwsem); 2471 for_each_net(net) { 2472 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2473 2474 if (atomic_read(&cnet->count) == 0) 2475 continue; 2476 nf_queue_nf_hook_drop(net); 2477 } 2478 up_read(&net_rwsem); 2479 2480 /* Need to wait for netns cleanup worker to finish, if its 2481 * running -- it might have deleted a net namespace from 2482 * the global list, so hook drop above might not have 2483 * affected all namespaces. 2484 */ 2485 net_ns_barrier(); 2486 2487 /* a skb w. unconfirmed conntrack could have been reinjected just 2488 * before we called nf_queue_nf_hook_drop(). 2489 * 2490 * This makes sure its inserted into conntrack table. 2491 */ 2492 synchronize_net(); 2493 2494 nf_ct_ext_bump_genid(); 2495 iter_data.data = data; 2496 nf_ct_iterate_cleanup(iter, &iter_data); 2497 2498 /* Another cpu might be in a rcu read section with 2499 * rcu protected pointer cleared in iter callback 2500 * or hidden via nf_ct_ext_bump_genid() above. 2501 * 2502 * Wait until those are done. 2503 */ 2504 synchronize_rcu(); 2505 } 2506 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2507 2508 static int kill_all(struct nf_conn *i, void *data) 2509 { 2510 return 1; 2511 } 2512 2513 void nf_conntrack_cleanup_start(void) 2514 { 2515 cleanup_nf_conntrack_bpf(); 2516 conntrack_gc_work.exiting = true; 2517 } 2518 2519 void nf_conntrack_cleanup_end(void) 2520 { 2521 RCU_INIT_POINTER(nf_ct_hook, NULL); 2522 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2523 kvfree(nf_conntrack_hash); 2524 2525 nf_conntrack_proto_fini(); 2526 nf_conntrack_helper_fini(); 2527 nf_conntrack_expect_fini(); 2528 2529 kmem_cache_destroy(nf_conntrack_cachep); 2530 } 2531 2532 /* 2533 * Mishearing the voices in his head, our hero wonders how he's 2534 * supposed to kill the mall. 2535 */ 2536 void nf_conntrack_cleanup_net(struct net *net) 2537 { 2538 LIST_HEAD(single); 2539 2540 list_add(&net->exit_list, &single); 2541 nf_conntrack_cleanup_net_list(&single); 2542 } 2543 2544 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2545 { 2546 struct nf_ct_iter_data iter_data = {}; 2547 struct net *net; 2548 int busy; 2549 2550 /* 2551 * This makes sure all current packets have passed through 2552 * netfilter framework. Roll on, two-stage module 2553 * delete... 
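 * The loop below then repeatedly runs kill_all() over every netns on
 * the exit list and reschedules until each per-netns conntrack count
 * has dropped to zero, i.e. until all references held elsewhere have
 * been released.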
2554 */ 2555 synchronize_net(); 2556 i_see_dead_people: 2557 busy = 0; 2558 list_for_each_entry(net, net_exit_list, exit_list) { 2559 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2560 2561 iter_data.net = net; 2562 nf_ct_iterate_cleanup_net(kill_all, &iter_data); 2563 if (atomic_read(&cnet->count) != 0) 2564 busy = 1; 2565 } 2566 if (busy) { 2567 schedule(); 2568 goto i_see_dead_people; 2569 } 2570 2571 list_for_each_entry(net, net_exit_list, exit_list) { 2572 nf_conntrack_ecache_pernet_fini(net); 2573 nf_conntrack_expect_pernet_fini(net); 2574 free_percpu(net->ct.stat); 2575 } 2576 } 2577 2578 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2579 { 2580 struct hlist_nulls_head *hash; 2581 unsigned int nr_slots, i; 2582 2583 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2584 return NULL; 2585 2586 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2587 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2588 2589 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2590 2591 if (hash && nulls) 2592 for (i = 0; i < nr_slots; i++) 2593 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2594 2595 return hash; 2596 } 2597 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2598 2599 int nf_conntrack_hash_resize(unsigned int hashsize) 2600 { 2601 int i, bucket; 2602 unsigned int old_size; 2603 struct hlist_nulls_head *hash, *old_hash; 2604 struct nf_conntrack_tuple_hash *h; 2605 struct nf_conn *ct; 2606 2607 if (!hashsize) 2608 return -EINVAL; 2609 2610 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2611 if (!hash) 2612 return -ENOMEM; 2613 2614 mutex_lock(&nf_conntrack_mutex); 2615 old_size = nf_conntrack_htable_size; 2616 if (old_size == hashsize) { 2617 mutex_unlock(&nf_conntrack_mutex); 2618 kvfree(hash); 2619 return 0; 2620 } 2621 2622 local_bh_disable(); 2623 nf_conntrack_all_lock(); 2624 write_seqcount_begin(&nf_conntrack_generation); 2625 2626 /* Lookups in the old hash might happen in parallel, which means we 2627 * might get false negatives during connection lookup. New connections 2628 * created because of a false negative won't make it into the hash 2629 * though since that required taking the locks. 2630 */ 2631 2632 for (i = 0; i < nf_conntrack_htable_size; i++) { 2633 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2634 unsigned int zone_id; 2635 2636 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2637 struct nf_conntrack_tuple_hash, hnnode); 2638 ct = nf_ct_tuplehash_to_ctrack(h); 2639 hlist_nulls_del_rcu(&h->hnnode); 2640 2641 zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); 2642 bucket = __hash_conntrack(nf_ct_net(ct), 2643 &h->tuple, zone_id, hashsize); 2644 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2645 } 2646 } 2647 old_hash = nf_conntrack_hash; 2648 2649 nf_conntrack_hash = hash; 2650 nf_conntrack_htable_size = hashsize; 2651 2652 write_seqcount_end(&nf_conntrack_generation); 2653 nf_conntrack_all_unlock(); 2654 local_bh_enable(); 2655 2656 mutex_unlock(&nf_conntrack_mutex); 2657 2658 synchronize_net(); 2659 kvfree(old_hash); 2660 return 0; 2661 } 2662 2663 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2664 { 2665 unsigned int hashsize; 2666 int rc; 2667 2668 if (current->nsproxy->net_ns != &init_net) 2669 return -EOPNOTSUPP; 2670 2671 /* On boot, we can set this without any fancy locking. 
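 * At runtime this is reached through the writable "hashsize" module
 * parameter (typically /sys/module/nf_conntrack/parameters/hashsize
 * when nf_conntrack is built as a module) and ends up in
 * nf_conntrack_hash_resize() above; per the check at the top of this
 * function, resizing is only allowed from the initial netns.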
*/ 2672 if (!nf_conntrack_hash) 2673 return param_set_uint(val, kp); 2674 2675 rc = kstrtouint(val, 0, &hashsize); 2676 if (rc) 2677 return rc; 2678 2679 return nf_conntrack_hash_resize(hashsize); 2680 } 2681 2682 int nf_conntrack_init_start(void) 2683 { 2684 unsigned long nr_pages = totalram_pages(); 2685 int max_factor = 8; 2686 int ret = -ENOMEM; 2687 int i; 2688 2689 seqcount_spinlock_init(&nf_conntrack_generation, 2690 &nf_conntrack_locks_all_lock); 2691 2692 for (i = 0; i < CONNTRACK_LOCKS; i++) 2693 spin_lock_init(&nf_conntrack_locks[i]); 2694 2695 if (!nf_conntrack_htable_size) { 2696 nf_conntrack_htable_size 2697 = (((nr_pages << PAGE_SHIFT) / 16384) 2698 / sizeof(struct hlist_head)); 2699 if (BITS_PER_LONG >= 64 && 2700 nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2701 nf_conntrack_htable_size = 262144; 2702 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2703 nf_conntrack_htable_size = 65536; 2704 2705 if (nf_conntrack_htable_size < 1024) 2706 nf_conntrack_htable_size = 1024; 2707 /* Use a max. factor of one by default to keep the average 2708 * hash chain length at 2 entries. Each entry has to be added 2709 * twice (once for original direction, once for reply). 2710 * When a table size is given we use the old value of 8 to 2711 * avoid implicit reduction of the max entries setting. 2712 */ 2713 max_factor = 1; 2714 } 2715 2716 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2717 if (!nf_conntrack_hash) 2718 return -ENOMEM; 2719 2720 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2721 2722 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2723 sizeof(struct nf_conn), 2724 NFCT_INFOMASK + 1, 2725 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2726 if (!nf_conntrack_cachep) 2727 goto err_cachep; 2728 2729 ret = nf_conntrack_expect_init(); 2730 if (ret < 0) 2731 goto err_expect; 2732 2733 ret = nf_conntrack_helper_init(); 2734 if (ret < 0) 2735 goto err_helper; 2736 2737 ret = nf_conntrack_proto_init(); 2738 if (ret < 0) 2739 goto err_proto; 2740 2741 conntrack_gc_work_init(&conntrack_gc_work); 2742 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2743 2744 ret = register_nf_conntrack_bpf(); 2745 if (ret < 0) 2746 goto err_kfunc; 2747 2748 return 0; 2749 2750 err_kfunc: 2751 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2752 nf_conntrack_proto_fini(); 2753 err_proto: 2754 nf_conntrack_helper_fini(); 2755 err_helper: 2756 nf_conntrack_expect_fini(); 2757 err_expect: 2758 kmem_cache_destroy(nf_conntrack_cachep); 2759 err_cachep: 2760 kvfree(nf_conntrack_hash); 2761 return ret; 2762 } 2763 2764 static const struct nf_ct_hook nf_conntrack_hook = { 2765 .update = nf_conntrack_update, 2766 .destroy = nf_ct_destroy, 2767 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2768 .attach = nf_conntrack_attach, 2769 }; 2770 2771 void nf_conntrack_init_end(void) 2772 { 2773 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2774 } 2775 2776 /* 2777 * We need to use special "null" values, not used in hash table 2778 */ 2779 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2780 2781 int nf_conntrack_init_net(struct net *net) 2782 { 2783 struct nf_conntrack_net *cnet = nf_ct_pernet(net); 2784 int ret = -ENOMEM; 2785 2786 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2787 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2788 atomic_set(&cnet->count, 0); 2789 2790 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2791 if (!net->ct.stat) 2792 return ret; 2793 2794 ret = nf_conntrack_expect_pernet_init(net); 2795 if (ret < 
0) 2796 goto err_expect; 2797 2798 nf_conntrack_acct_pernet_init(net); 2799 nf_conntrack_tstamp_pernet_init(net); 2800 nf_conntrack_ecache_pernet_init(net); 2801 nf_conntrack_helper_pernet_init(net); 2802 nf_conntrack_proto_pernet_init(net); 2803 2804 return 0; 2805 2806 err_expect: 2807 free_percpu(net->ct.stat); 2808 return ret; 2809 } 2810 2811 /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ 2812 2813 int __nf_ct_change_timeout(struct nf_conn *ct, u64 timeout) 2814 { 2815 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 2816 return -EPERM; 2817 2818 __nf_ct_set_timeout(ct, timeout); 2819 2820 if (test_bit(IPS_DYING_BIT, &ct->status)) 2821 return -ETIME; 2822 2823 return 0; 2824 } 2825 EXPORT_SYMBOL_GPL(__nf_ct_change_timeout); 2826 2827 void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off) 2828 { 2829 unsigned int bit; 2830 2831 /* Ignore these unchangable bits */ 2832 on &= ~IPS_UNCHANGEABLE_MASK; 2833 off &= ~IPS_UNCHANGEABLE_MASK; 2834 2835 for (bit = 0; bit < __IPS_MAX_BIT; bit++) { 2836 if (on & (1 << bit)) 2837 set_bit(bit, &ct->status); 2838 else if (off & (1 << bit)) 2839 clear_bit(bit, &ct->status); 2840 } 2841 } 2842 EXPORT_SYMBOL_GPL(__nf_ct_change_status); 2843 2844 int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status) 2845 { 2846 unsigned long d; 2847 2848 d = ct->status ^ status; 2849 2850 if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) 2851 /* unchangeable */ 2852 return -EBUSY; 2853 2854 if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) 2855 /* SEEN_REPLY bit can only be set */ 2856 return -EBUSY; 2857 2858 if (d & IPS_ASSURED && !(status & IPS_ASSURED)) 2859 /* ASSURED bit can only be set */ 2860 return -EBUSY; 2861 2862 __nf_ct_change_status(ct, status, 0); 2863 return 0; 2864 } 2865 EXPORT_SYMBOL_GPL(nf_ct_change_status_common); 2866
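/* Illustrative sketch (editorial note, not part of the kernel sources):
 * the two helpers above are intended for ctnetlink/BPF style updates of
 * an existing entry.  The caller name below is made up and the timeout
 * is given in jiffies, as __nf_ct_change_timeout() expects:
 *
 *	static void example_refresh(struct nf_conn *ct)
 *	{
 *		if (__nf_ct_change_timeout(ct, 120 * HZ) == 0)
 *			nf_ct_change_status_common(ct,
 *						   ct->status | IPS_ASSURED);
 *	}
 *
 * Real callers propagate the error codes instead: -EPERM for entries
 * with a fixed timeout, -ETIME for dying entries and -EBUSY for status
 * transitions that nf_ct_change_status_common() refuses.
 */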