1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Connection state tracking for netfilter. This is separated from, 3 but required by, the NAT layer; it can also be used by an iptables 4 extension. */ 5 6 /* (C) 1999-2001 Paul `Rusty' Russell 7 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 8 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 9 * (C) 2005-2012 Patrick McHardy <kaber@trash.net> 10 */ 11 12 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 13 14 #include <linux/types.h> 15 #include <linux/netfilter.h> 16 #include <linux/module.h> 17 #include <linux/sched.h> 18 #include <linux/skbuff.h> 19 #include <linux/proc_fs.h> 20 #include <linux/vmalloc.h> 21 #include <linux/stddef.h> 22 #include <linux/slab.h> 23 #include <linux/random.h> 24 #include <linux/jhash.h> 25 #include <linux/siphash.h> 26 #include <linux/err.h> 27 #include <linux/percpu.h> 28 #include <linux/moduleparam.h> 29 #include <linux/notifier.h> 30 #include <linux/kernel.h> 31 #include <linux/netdevice.h> 32 #include <linux/socket.h> 33 #include <linux/mm.h> 34 #include <linux/nsproxy.h> 35 #include <linux/rculist_nulls.h> 36 37 #include <net/netfilter/nf_conntrack.h> 38 #include <net/netfilter/nf_conntrack_l4proto.h> 39 #include <net/netfilter/nf_conntrack_expect.h> 40 #include <net/netfilter/nf_conntrack_helper.h> 41 #include <net/netfilter/nf_conntrack_seqadj.h> 42 #include <net/netfilter/nf_conntrack_core.h> 43 #include <net/netfilter/nf_conntrack_extend.h> 44 #include <net/netfilter/nf_conntrack_acct.h> 45 #include <net/netfilter/nf_conntrack_ecache.h> 46 #include <net/netfilter/nf_conntrack_zones.h> 47 #include <net/netfilter/nf_conntrack_timestamp.h> 48 #include <net/netfilter/nf_conntrack_timeout.h> 49 #include <net/netfilter/nf_conntrack_labels.h> 50 #include <net/netfilter/nf_conntrack_synproxy.h> 51 #include <net/netfilter/nf_nat.h> 52 #include <net/netfilter/nf_nat_helper.h> 53 #include <net/netns/hash.h> 54 #include <net/ip.h> 55 56 #include "nf_internals.h" 57 58 extern unsigned int nf_conntrack_net_id; 59 60 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 61 EXPORT_SYMBOL_GPL(nf_conntrack_locks); 62 63 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 64 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 65 66 struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 67 EXPORT_SYMBOL_GPL(nf_conntrack_hash); 68 69 struct conntrack_gc_work { 70 struct delayed_work dwork; 71 u32 last_bucket; 72 bool exiting; 73 bool early_drop; 74 long next_gc_run; 75 }; 76 77 static __read_mostly struct kmem_cache *nf_conntrack_cachep; 78 static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 79 static __read_mostly bool nf_conntrack_locks_all; 80 81 /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ 82 #define GC_MAX_BUCKETS_DIV 128u 83 /* upper bound of full table scan */ 84 #define GC_MAX_SCAN_JIFFIES (16u * HZ) 85 /* desired ratio of entries found to be expired */ 86 #define GC_EVICT_RATIO 50u 87 88 static struct conntrack_gc_work conntrack_gc_work; 89 90 extern unsigned int nf_conntrack_net_id; 91 92 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) 93 { 94 /* 1) Acquire the lock */ 95 spin_lock(lock); 96 97 /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics 98 * It pairs with the smp_store_release() in nf_conntrack_all_unlock() 99 */ 100 if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) 101 return; 102 103 /* fast path failed, unlock */ 104 spin_unlock(lock); 105 106 /* Slow path 1) get global lock */ 107 spin_lock(&nf_conntrack_locks_all_lock); 108 109 /* Slow path 2) get the lock we want */ 110 spin_lock(lock); 111 112 /* Slow path 3) release the global lock */ 113 spin_unlock(&nf_conntrack_locks_all_lock); 114 } 115 EXPORT_SYMBOL_GPL(nf_conntrack_lock); 116 117 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) 118 { 119 h1 %= CONNTRACK_LOCKS; 120 h2 %= CONNTRACK_LOCKS; 121 spin_unlock(&nf_conntrack_locks[h1]); 122 if (h1 != h2) 123 spin_unlock(&nf_conntrack_locks[h2]); 124 } 125 126 /* return true if we need to recompute hashes (in case hash table was resized) */ 127 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, 128 unsigned int h2, unsigned int sequence) 129 { 130 h1 %= CONNTRACK_LOCKS; 131 h2 %= CONNTRACK_LOCKS; 132 if (h1 <= h2) { 133 nf_conntrack_lock(&nf_conntrack_locks[h1]); 134 if (h1 != h2) 135 spin_lock_nested(&nf_conntrack_locks[h2], 136 SINGLE_DEPTH_NESTING); 137 } else { 138 nf_conntrack_lock(&nf_conntrack_locks[h2]); 139 spin_lock_nested(&nf_conntrack_locks[h1], 140 SINGLE_DEPTH_NESTING); 141 } 142 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { 143 nf_conntrack_double_unlock(h1, h2); 144 return true; 145 } 146 return false; 147 } 148 149 static void nf_conntrack_all_lock(void) 150 __acquires(&nf_conntrack_locks_all_lock) 151 { 152 int i; 153 154 spin_lock(&nf_conntrack_locks_all_lock); 155 156 nf_conntrack_locks_all = true; 157 158 for (i = 0; i < CONNTRACK_LOCKS; i++) { 159 spin_lock(&nf_conntrack_locks[i]); 160 161 /* This spin_unlock provides the "release" to ensure that 162 * nf_conntrack_locks_all==true is visible to everyone that 163 * acquired spin_lock(&nf_conntrack_locks[]). 164 */ 165 spin_unlock(&nf_conntrack_locks[i]); 166 } 167 } 168 169 static void nf_conntrack_all_unlock(void) 170 __releases(&nf_conntrack_locks_all_lock) 171 { 172 /* All prior stores must be complete before we clear 173 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() 174 * might observe the false value but not the entire 175 * critical section. 176 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 177 */ 178 smp_store_release(&nf_conntrack_locks_all, false); 179 spin_unlock(&nf_conntrack_locks_all_lock); 180 } 181 182 unsigned int nf_conntrack_htable_size __read_mostly; 183 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 184 185 unsigned int nf_conntrack_max __read_mostly; 186 EXPORT_SYMBOL_GPL(nf_conntrack_max); 187 seqcount_spinlock_t nf_conntrack_generation __read_mostly; 188 static unsigned int nf_conntrack_hash_rnd __read_mostly; 189 190 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 191 const struct net *net) 192 { 193 unsigned int n; 194 u32 seed; 195 196 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 197 198 /* The direction must be ignored, so we hash everything up to the 199 * destination ports (which is a multiple of 4) and treat the last 200 * three bytes manually. 201 */ 202 seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); 203 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); 204 return jhash2((u32 *)tuple, n, seed ^ 205 (((__force __u16)tuple->dst.u.all << 16) | 206 tuple->dst.protonum)); 207 } 208 209 static u32 scale_hash(u32 hash) 210 { 211 return reciprocal_scale(hash, nf_conntrack_htable_size); 212 } 213 214 static u32 __hash_conntrack(const struct net *net, 215 const struct nf_conntrack_tuple *tuple, 216 unsigned int size) 217 { 218 return reciprocal_scale(hash_conntrack_raw(tuple, net), size); 219 } 220 221 static u32 hash_conntrack(const struct net *net, 222 const struct nf_conntrack_tuple *tuple) 223 { 224 return scale_hash(hash_conntrack_raw(tuple, net)); 225 } 226 227 static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, 228 unsigned int dataoff, 229 struct nf_conntrack_tuple *tuple) 230 { struct { 231 __be16 sport; 232 __be16 dport; 233 } _inet_hdr, *inet_hdr; 234 235 /* Actually only need first 4 bytes to get ports. */ 236 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 237 if (!inet_hdr) 238 return false; 239 240 tuple->src.u.udp.port = inet_hdr->sport; 241 tuple->dst.u.udp.port = inet_hdr->dport; 242 return true; 243 } 244 245 static bool 246 nf_ct_get_tuple(const struct sk_buff *skb, 247 unsigned int nhoff, 248 unsigned int dataoff, 249 u_int16_t l3num, 250 u_int8_t protonum, 251 struct net *net, 252 struct nf_conntrack_tuple *tuple) 253 { 254 unsigned int size; 255 const __be32 *ap; 256 __be32 _addrs[8]; 257 258 memset(tuple, 0, sizeof(*tuple)); 259 260 tuple->src.l3num = l3num; 261 switch (l3num) { 262 case NFPROTO_IPV4: 263 nhoff += offsetof(struct iphdr, saddr); 264 size = 2 * sizeof(__be32); 265 break; 266 case NFPROTO_IPV6: 267 nhoff += offsetof(struct ipv6hdr, saddr); 268 size = sizeof(_addrs); 269 break; 270 default: 271 return true; 272 } 273 274 ap = skb_header_pointer(skb, nhoff, size, _addrs); 275 if (!ap) 276 return false; 277 278 switch (l3num) { 279 case NFPROTO_IPV4: 280 tuple->src.u3.ip = ap[0]; 281 tuple->dst.u3.ip = ap[1]; 282 break; 283 case NFPROTO_IPV6: 284 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 285 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 286 break; 287 } 288 289 tuple->dst.protonum = protonum; 290 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 291 292 switch (protonum) { 293 #if IS_ENABLED(CONFIG_IPV6) 294 case IPPROTO_ICMPV6: 295 return icmpv6_pkt_to_tuple(skb, dataoff, net, tuple); 296 #endif 297 case IPPROTO_ICMP: 298 return icmp_pkt_to_tuple(skb, dataoff, net, tuple); 299 #ifdef CONFIG_NF_CT_PROTO_GRE 300 case IPPROTO_GRE: 301 return gre_pkt_to_tuple(skb, dataoff, net, tuple); 302 #endif 303 case IPPROTO_TCP: 304 case IPPROTO_UDP: /* fallthrough */ 305 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 306 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 307 case IPPROTO_UDPLITE: 308 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 309 #endif 310 #ifdef CONFIG_NF_CT_PROTO_SCTP 311 case IPPROTO_SCTP: 312 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 313 #endif 314 #ifdef CONFIG_NF_CT_PROTO_DCCP 315 case IPPROTO_DCCP: 316 return nf_ct_get_tuple_ports(skb, dataoff, tuple); 317 #endif 318 default: 319 break; 320 } 321 322 return true; 323 } 324 325 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 326 u_int8_t *protonum) 327 { 328 int dataoff = -1; 329 const struct iphdr *iph; 330 struct iphdr _iph; 331 332 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 333 if (!iph) 334 return -1; 335 336 /* Conntrack defragments packets, we might still see fragments 337 * inside ICMP packets though. 338 */ 339 if (iph->frag_off & htons(IP_OFFSET)) 340 return -1; 341 342 dataoff = nhoff + (iph->ihl << 2); 343 *protonum = iph->protocol; 344 345 /* Check bogus IP headers */ 346 if (dataoff > skb->len) { 347 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", 348 nhoff, iph->ihl << 2, skb->len); 349 return -1; 350 } 351 return dataoff; 352 } 353 354 #if IS_ENABLED(CONFIG_IPV6) 355 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 356 u8 *protonum) 357 { 358 int protoff = -1; 359 unsigned int extoff = nhoff + sizeof(struct ipv6hdr); 360 __be16 frag_off; 361 u8 nexthdr; 362 363 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), 364 &nexthdr, sizeof(nexthdr)) != 0) { 365 pr_debug("can't get nexthdr\n"); 366 return -1; 367 } 368 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); 369 /* 370 * (protoff == skb->len) means the packet has not data, just 371 * IPv6 and possibly extensions headers, but it is tracked anyway 372 */ 373 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { 374 pr_debug("can't find proto in pkt\n"); 375 return -1; 376 } 377 378 *protonum = nexthdr; 379 return protoff; 380 } 381 #endif 382 383 static int get_l4proto(const struct sk_buff *skb, 384 unsigned int nhoff, u8 pf, u8 *l4num) 385 { 386 switch (pf) { 387 case NFPROTO_IPV4: 388 return ipv4_get_l4proto(skb, nhoff, l4num); 389 #if IS_ENABLED(CONFIG_IPV6) 390 case NFPROTO_IPV6: 391 return ipv6_get_l4proto(skb, nhoff, l4num); 392 #endif 393 default: 394 *l4num = 0; 395 break; 396 } 397 return -1; 398 } 399 400 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, 401 u_int16_t l3num, 402 struct net *net, struct nf_conntrack_tuple *tuple) 403 { 404 u8 protonum; 405 int protoff; 406 407 protoff = get_l4proto(skb, nhoff, l3num, &protonum); 408 if (protoff <= 0) 409 return false; 410 411 return nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple); 412 } 413 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); 414 415 bool 416 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, 417 const struct nf_conntrack_tuple *orig) 418 { 419 memset(inverse, 0, sizeof(*inverse)); 420 421 inverse->src.l3num = orig->src.l3num; 422 423 switch (orig->src.l3num) { 424 case NFPROTO_IPV4: 425 inverse->src.u3.ip = orig->dst.u3.ip; 426 inverse->dst.u3.ip = orig->src.u3.ip; 427 break; 428 case NFPROTO_IPV6: 429 inverse->src.u3.in6 = orig->dst.u3.in6; 430 inverse->dst.u3.in6 = orig->src.u3.in6; 431 break; 432 default: 433 break; 434 } 435 436 inverse->dst.dir = !orig->dst.dir; 437 438 inverse->dst.protonum = orig->dst.protonum; 439 440 switch (orig->dst.protonum) { 441 case IPPROTO_ICMP: 442 return nf_conntrack_invert_icmp_tuple(inverse, orig); 443 #if IS_ENABLED(CONFIG_IPV6) 444 case IPPROTO_ICMPV6: 445 return nf_conntrack_invert_icmpv6_tuple(inverse, orig); 446 #endif 447 } 448 449 inverse->src.u.all = orig->dst.u.all; 450 inverse->dst.u.all = orig->src.u.all; 451 return true; 452 } 453 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); 454 455 /* Generate a almost-unique pseudo-id for a given conntrack. 456 * 457 * intentionally doesn't re-use any of the seeds used for hash 458 * table location, we assume id gets exposed to userspace. 459 * 460 * Following nf_conn items do not change throughout lifetime 461 * of the nf_conn: 462 * 463 * 1. nf_conn address 464 * 2. nf_conn->master address (normally NULL) 465 * 3. the associated net namespace 466 * 4. the original direction tuple 467 */ 468 u32 nf_ct_get_id(const struct nf_conn *ct) 469 { 470 static __read_mostly siphash_key_t ct_id_seed; 471 unsigned long a, b, c, d; 472 473 net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); 474 475 a = (unsigned long)ct; 476 b = (unsigned long)ct->master; 477 c = (unsigned long)nf_ct_net(ct); 478 d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 479 sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), 480 &ct_id_seed); 481 #ifdef CONFIG_64BIT 482 return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); 483 #else 484 return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed); 485 #endif 486 } 487 EXPORT_SYMBOL_GPL(nf_ct_get_id); 488 489 static void 490 clean_from_lists(struct nf_conn *ct) 491 { 492 pr_debug("clean_from_lists(%p)\n", ct); 493 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 494 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 495 496 /* Destroy all pending expectations */ 497 nf_ct_remove_expectations(ct); 498 } 499 500 /* must be called with local_bh_disable */ 501 static void nf_ct_add_to_dying_list(struct nf_conn *ct) 502 { 503 struct ct_pcpu *pcpu; 504 505 /* add this conntrack to the (per cpu) dying list */ 506 ct->cpu = smp_processor_id(); 507 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 508 509 spin_lock(&pcpu->lock); 510 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 511 &pcpu->dying); 512 spin_unlock(&pcpu->lock); 513 } 514 515 /* must be called with local_bh_disable */ 516 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) 517 { 518 struct ct_pcpu *pcpu; 519 520 /* add this conntrack to the (per cpu) unconfirmed list */ 521 ct->cpu = smp_processor_id(); 522 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 523 524 spin_lock(&pcpu->lock); 525 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 526 &pcpu->unconfirmed); 527 spin_unlock(&pcpu->lock); 528 } 529 530 /* must be called with local_bh_disable */ 531 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) 532 { 533 struct ct_pcpu *pcpu; 534 535 /* We overload first tuple to link into unconfirmed or dying list.*/ 536 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 537 538 spin_lock(&pcpu->lock); 539 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); 540 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 541 spin_unlock(&pcpu->lock); 542 } 543 544 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 545 546 /* Released via destroy_conntrack() */ 547 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 548 const struct nf_conntrack_zone *zone, 549 gfp_t flags) 550 { 551 struct nf_conn *tmpl, *p; 552 553 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 554 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 555 if (!tmpl) 556 return NULL; 557 558 p = tmpl; 559 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 560 if (tmpl != p) { 561 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 562 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 563 } 564 } else { 565 tmpl = kzalloc(sizeof(*tmpl), flags); 566 if (!tmpl) 567 return NULL; 568 } 569 570 tmpl->status = IPS_TEMPLATE; 571 write_pnet(&tmpl->ct_net, net); 572 nf_ct_zone_add(tmpl, zone); 573 atomic_set(&tmpl->ct_general.use, 0); 574 575 return tmpl; 576 } 577 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 578 579 void nf_ct_tmpl_free(struct nf_conn *tmpl) 580 { 581 nf_ct_ext_destroy(tmpl); 582 583 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 584 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 585 else 586 kfree(tmpl); 587 } 588 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 589 590 static void destroy_gre_conntrack(struct nf_conn *ct) 591 { 592 #ifdef CONFIG_NF_CT_PROTO_GRE 593 struct nf_conn *master = ct->master; 594 595 if (master) 596 nf_ct_gre_keymap_destroy(master); 597 #endif 598 } 599 600 static void 601 destroy_conntrack(struct nf_conntrack *nfct) 602 { 603 struct nf_conn *ct = (struct nf_conn *)nfct; 604 605 pr_debug("destroy_conntrack(%p)\n", ct); 606 WARN_ON(atomic_read(&nfct->use) != 0); 607 608 if (unlikely(nf_ct_is_template(ct))) { 609 nf_ct_tmpl_free(ct); 610 return; 611 } 612 613 if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE)) 614 destroy_gre_conntrack(ct); 615 616 local_bh_disable(); 617 /* Expectations will have been removed in clean_from_lists, 618 * except TFTP can create an expectation on the first packet, 619 * before connection is in the list, so we need to clean here, 620 * too. 621 */ 622 nf_ct_remove_expectations(ct); 623 624 nf_ct_del_from_dying_or_unconfirmed_list(ct); 625 626 local_bh_enable(); 627 628 if (ct->master) 629 nf_ct_put(ct->master); 630 631 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); 632 nf_conntrack_free(ct); 633 } 634 635 static void nf_ct_delete_from_lists(struct nf_conn *ct) 636 { 637 struct net *net = nf_ct_net(ct); 638 unsigned int hash, reply_hash; 639 unsigned int sequence; 640 641 nf_ct_helper_destroy(ct); 642 643 local_bh_disable(); 644 do { 645 sequence = read_seqcount_begin(&nf_conntrack_generation); 646 hash = hash_conntrack(net, 647 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 648 reply_hash = hash_conntrack(net, 649 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 650 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 651 652 clean_from_lists(ct); 653 nf_conntrack_double_unlock(hash, reply_hash); 654 655 nf_ct_add_to_dying_list(ct); 656 657 local_bh_enable(); 658 } 659 660 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 661 { 662 struct nf_conn_tstamp *tstamp; 663 struct net *net; 664 665 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 666 return false; 667 668 tstamp = nf_conn_tstamp_find(ct); 669 if (tstamp && tstamp->stop == 0) 670 tstamp->stop = ktime_get_real_ns(); 671 672 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 673 portid, report) < 0) { 674 /* destroy event was not delivered. nf_ct_put will 675 * be done by event cache worker on redelivery. 676 */ 677 nf_ct_delete_from_lists(ct); 678 nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL); 679 return false; 680 } 681 682 net = nf_ct_net(ct); 683 if (nf_conntrack_ecache_dwork_pending(net)) 684 nf_conntrack_ecache_work(net, NFCT_ECACHE_DESTROY_SENT); 685 nf_ct_delete_from_lists(ct); 686 nf_ct_put(ct); 687 return true; 688 } 689 EXPORT_SYMBOL_GPL(nf_ct_delete); 690 691 static inline bool 692 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 693 const struct nf_conntrack_tuple *tuple, 694 const struct nf_conntrack_zone *zone, 695 const struct net *net) 696 { 697 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 698 699 /* A conntrack can be recreated with the equal tuple, 700 * so we need to check that the conntrack is confirmed 701 */ 702 return nf_ct_tuple_equal(tuple, &h->tuple) && 703 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 704 nf_ct_is_confirmed(ct) && 705 net_eq(net, nf_ct_net(ct)); 706 } 707 708 static inline bool 709 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 710 { 711 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 712 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 713 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 714 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 715 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 716 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 717 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 718 } 719 720 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 721 static void nf_ct_gc_expired(struct nf_conn *ct) 722 { 723 if (!atomic_inc_not_zero(&ct->ct_general.use)) 724 return; 725 726 if (nf_ct_should_gc(ct)) 727 nf_ct_kill(ct); 728 729 nf_ct_put(ct); 730 } 731 732 /* 733 * Warning : 734 * - Caller must take a reference on returned object 735 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 736 */ 737 static struct nf_conntrack_tuple_hash * 738 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 739 const struct nf_conntrack_tuple *tuple, u32 hash) 740 { 741 struct nf_conntrack_tuple_hash *h; 742 struct hlist_nulls_head *ct_hash; 743 struct hlist_nulls_node *n; 744 unsigned int bucket, hsize; 745 746 begin: 747 nf_conntrack_get_ht(&ct_hash, &hsize); 748 bucket = reciprocal_scale(hash, hsize); 749 750 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 751 struct nf_conn *ct; 752 753 ct = nf_ct_tuplehash_to_ctrack(h); 754 if (nf_ct_is_expired(ct)) { 755 nf_ct_gc_expired(ct); 756 continue; 757 } 758 759 if (nf_ct_key_equal(h, tuple, zone, net)) 760 return h; 761 } 762 /* 763 * if the nulls value we got at the end of this lookup is 764 * not the expected one, we must restart lookup. 765 * We probably met an item that was moved to another chain. 766 */ 767 if (get_nulls_value(n) != bucket) { 768 NF_CT_STAT_INC_ATOMIC(net, search_restart); 769 goto begin; 770 } 771 772 return NULL; 773 } 774 775 /* Find a connection corresponding to a tuple. */ 776 static struct nf_conntrack_tuple_hash * 777 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 778 const struct nf_conntrack_tuple *tuple, u32 hash) 779 { 780 struct nf_conntrack_tuple_hash *h; 781 struct nf_conn *ct; 782 783 rcu_read_lock(); 784 785 h = ____nf_conntrack_find(net, zone, tuple, hash); 786 if (h) { 787 /* We have a candidate that matches the tuple we're interested 788 * in, try to obtain a reference and re-check tuple 789 */ 790 ct = nf_ct_tuplehash_to_ctrack(h); 791 if (likely(atomic_inc_not_zero(&ct->ct_general.use))) { 792 if (likely(nf_ct_key_equal(h, tuple, zone, net))) 793 goto found; 794 795 /* TYPESAFE_BY_RCU recycled the candidate */ 796 nf_ct_put(ct); 797 } 798 799 h = NULL; 800 } 801 found: 802 rcu_read_unlock(); 803 804 return h; 805 } 806 807 struct nf_conntrack_tuple_hash * 808 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 809 const struct nf_conntrack_tuple *tuple) 810 { 811 return __nf_conntrack_find_get(net, zone, tuple, 812 hash_conntrack_raw(tuple, net)); 813 } 814 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 815 816 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 817 unsigned int hash, 818 unsigned int reply_hash) 819 { 820 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 821 &nf_conntrack_hash[hash]); 822 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 823 &nf_conntrack_hash[reply_hash]); 824 } 825 826 int 827 nf_conntrack_hash_check_insert(struct nf_conn *ct) 828 { 829 const struct nf_conntrack_zone *zone; 830 struct net *net = nf_ct_net(ct); 831 unsigned int hash, reply_hash; 832 struct nf_conntrack_tuple_hash *h; 833 struct hlist_nulls_node *n; 834 unsigned int sequence; 835 836 zone = nf_ct_zone(ct); 837 838 local_bh_disable(); 839 do { 840 sequence = read_seqcount_begin(&nf_conntrack_generation); 841 hash = hash_conntrack(net, 842 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 843 reply_hash = hash_conntrack(net, 844 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 845 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 846 847 /* See if there's one in the list already, including reverse */ 848 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 849 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 850 zone, net)) 851 goto out; 852 853 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 854 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 855 zone, net)) 856 goto out; 857 858 smp_wmb(); 859 /* The caller holds a reference to this object */ 860 atomic_set(&ct->ct_general.use, 2); 861 __nf_conntrack_hash_insert(ct, hash, reply_hash); 862 nf_conntrack_double_unlock(hash, reply_hash); 863 NF_CT_STAT_INC(net, insert); 864 local_bh_enable(); 865 return 0; 866 867 out: 868 nf_conntrack_double_unlock(hash, reply_hash); 869 local_bh_enable(); 870 return -EEXIST; 871 } 872 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 873 874 void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets, 875 unsigned int bytes) 876 { 877 struct nf_conn_acct *acct; 878 879 acct = nf_conn_acct_find(ct); 880 if (acct) { 881 struct nf_conn_counter *counter = acct->counter; 882 883 atomic64_add(packets, &counter[dir].packets); 884 atomic64_add(bytes, &counter[dir].bytes); 885 } 886 } 887 EXPORT_SYMBOL_GPL(nf_ct_acct_add); 888 889 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 890 const struct nf_conn *loser_ct) 891 { 892 struct nf_conn_acct *acct; 893 894 acct = nf_conn_acct_find(loser_ct); 895 if (acct) { 896 struct nf_conn_counter *counter = acct->counter; 897 unsigned int bytes; 898 899 /* u32 should be fine since we must have seen one packet. */ 900 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 901 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); 902 } 903 } 904 905 static void __nf_conntrack_insert_prepare(struct nf_conn *ct) 906 { 907 struct nf_conn_tstamp *tstamp; 908 909 atomic_inc(&ct->ct_general.use); 910 ct->status |= IPS_CONFIRMED; 911 912 /* set conntrack timestamp, if enabled. */ 913 tstamp = nf_conn_tstamp_find(ct); 914 if (tstamp) 915 tstamp->start = ktime_get_real_ns(); 916 } 917 918 /* caller must hold locks to prevent concurrent changes */ 919 static int __nf_ct_resolve_clash(struct sk_buff *skb, 920 struct nf_conntrack_tuple_hash *h) 921 { 922 /* This is the conntrack entry already in hashes that won race. */ 923 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 924 enum ip_conntrack_info ctinfo; 925 struct nf_conn *loser_ct; 926 927 loser_ct = nf_ct_get(skb, &ctinfo); 928 929 if (nf_ct_is_dying(ct)) 930 return NF_DROP; 931 932 if (((ct->status & IPS_NAT_DONE_MASK) == 0) || 933 nf_ct_match(ct, loser_ct)) { 934 struct net *net = nf_ct_net(ct); 935 936 nf_conntrack_get(&ct->ct_general); 937 938 nf_ct_acct_merge(ct, ctinfo, loser_ct); 939 nf_ct_add_to_dying_list(loser_ct); 940 nf_conntrack_put(&loser_ct->ct_general); 941 nf_ct_set(skb, ct, ctinfo); 942 943 NF_CT_STAT_INC(net, clash_resolve); 944 return NF_ACCEPT; 945 } 946 947 return NF_DROP; 948 } 949 950 /** 951 * nf_ct_resolve_clash_harder - attempt to insert clashing conntrack entry 952 * 953 * @skb: skb that causes the collision 954 * @repl_idx: hash slot for reply direction 955 * 956 * Called when origin or reply direction had a clash. 957 * The skb can be handled without packet drop provided the reply direction 958 * is unique or there the existing entry has the identical tuple in both 959 * directions. 960 * 961 * Caller must hold conntrack table locks to prevent concurrent updates. 962 * 963 * Returns NF_DROP if the clash could not be handled. 964 */ 965 static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) 966 { 967 struct nf_conn *loser_ct = (struct nf_conn *)skb_nfct(skb); 968 const struct nf_conntrack_zone *zone; 969 struct nf_conntrack_tuple_hash *h; 970 struct hlist_nulls_node *n; 971 struct net *net; 972 973 zone = nf_ct_zone(loser_ct); 974 net = nf_ct_net(loser_ct); 975 976 /* Reply direction must never result in a clash, unless both origin 977 * and reply tuples are identical. 978 */ 979 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[repl_idx], hnnode) { 980 if (nf_ct_key_equal(h, 981 &loser_ct->tuplehash[IP_CT_DIR_REPLY].tuple, 982 zone, net)) 983 return __nf_ct_resolve_clash(skb, h); 984 } 985 986 /* We want the clashing entry to go away real soon: 1 second timeout. */ 987 loser_ct->timeout = nfct_time_stamp + HZ; 988 989 /* IPS_NAT_CLASH removes the entry automatically on the first 990 * reply. Also prevents UDP tracker from moving the entry to 991 * ASSURED state, i.e. the entry can always be evicted under 992 * pressure. 993 */ 994 loser_ct->status |= IPS_FIXED_TIMEOUT | IPS_NAT_CLASH; 995 996 __nf_conntrack_insert_prepare(loser_ct); 997 998 /* fake add for ORIGINAL dir: we want lookups to only find the entry 999 * already in the table. This also hides the clashing entry from 1000 * ctnetlink iteration, i.e. conntrack -L won't show them. 1001 */ 1002 hlist_nulls_add_fake(&loser_ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 1003 1004 hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 1005 &nf_conntrack_hash[repl_idx]); 1006 1007 NF_CT_STAT_INC(net, clash_resolve); 1008 return NF_ACCEPT; 1009 } 1010 1011 /** 1012 * nf_ct_resolve_clash - attempt to handle clash without packet drop 1013 * 1014 * @skb: skb that causes the clash 1015 * @h: tuplehash of the clashing entry already in table 1016 * @reply_hash: hash slot for reply direction 1017 * 1018 * A conntrack entry can be inserted to the connection tracking table 1019 * if there is no existing entry with an identical tuple. 1020 * 1021 * If there is one, @skb (and the assocated, unconfirmed conntrack) has 1022 * to be dropped. In case @skb is retransmitted, next conntrack lookup 1023 * will find the already-existing entry. 1024 * 1025 * The major problem with such packet drop is the extra delay added by 1026 * the packet loss -- it will take some time for a retransmit to occur 1027 * (or the sender to time out when waiting for a reply). 1028 * 1029 * This function attempts to handle the situation without packet drop. 1030 * 1031 * If @skb has no NAT transformation or if the colliding entries are 1032 * exactly the same, only the to-be-confirmed conntrack entry is discarded 1033 * and @skb is associated with the conntrack entry already in the table. 1034 * 1035 * Failing that, the new, unconfirmed conntrack is still added to the table 1036 * provided that the collision only occurs in the ORIGINAL direction. 1037 * The new entry will be added only in the non-clashing REPLY direction, 1038 * so packets in the ORIGINAL direction will continue to match the existing 1039 * entry. The new entry will also have a fixed timeout so it expires -- 1040 * due to the collision, it will only see reply traffic. 1041 * 1042 * Returns NF_DROP if the clash could not be resolved. 1043 */ 1044 static __cold noinline int 1045 nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h, 1046 u32 reply_hash) 1047 { 1048 /* This is the conntrack entry already in hashes that won race. */ 1049 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 1050 const struct nf_conntrack_l4proto *l4proto; 1051 enum ip_conntrack_info ctinfo; 1052 struct nf_conn *loser_ct; 1053 struct net *net; 1054 int ret; 1055 1056 loser_ct = nf_ct_get(skb, &ctinfo); 1057 net = nf_ct_net(loser_ct); 1058 1059 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1060 if (!l4proto->allow_clash) 1061 goto drop; 1062 1063 ret = __nf_ct_resolve_clash(skb, h); 1064 if (ret == NF_ACCEPT) 1065 return ret; 1066 1067 ret = nf_ct_resolve_clash_harder(skb, reply_hash); 1068 if (ret == NF_ACCEPT) 1069 return ret; 1070 1071 drop: 1072 nf_ct_add_to_dying_list(loser_ct); 1073 NF_CT_STAT_INC(net, drop); 1074 NF_CT_STAT_INC(net, insert_failed); 1075 return NF_DROP; 1076 } 1077 1078 /* Confirm a connection given skb; places it in hash table */ 1079 int 1080 __nf_conntrack_confirm(struct sk_buff *skb) 1081 { 1082 const struct nf_conntrack_zone *zone; 1083 unsigned int hash, reply_hash; 1084 struct nf_conntrack_tuple_hash *h; 1085 struct nf_conn *ct; 1086 struct nf_conn_help *help; 1087 struct hlist_nulls_node *n; 1088 enum ip_conntrack_info ctinfo; 1089 struct net *net; 1090 unsigned int sequence; 1091 int ret = NF_DROP; 1092 1093 ct = nf_ct_get(skb, &ctinfo); 1094 net = nf_ct_net(ct); 1095 1096 /* ipt_REJECT uses nf_conntrack_attach to attach related 1097 ICMP/TCP RST packets in other direction. Actual packet 1098 which created connection will be IP_CT_NEW or for an 1099 expected connection, IP_CT_RELATED. */ 1100 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 1101 return NF_ACCEPT; 1102 1103 zone = nf_ct_zone(ct); 1104 local_bh_disable(); 1105 1106 do { 1107 sequence = read_seqcount_begin(&nf_conntrack_generation); 1108 /* reuse the hash saved before */ 1109 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 1110 hash = scale_hash(hash); 1111 reply_hash = hash_conntrack(net, 1112 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 1113 1114 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 1115 1116 /* We're not in hash table, and we refuse to set up related 1117 * connections for unconfirmed conns. But packet copies and 1118 * REJECT will give spurious warnings here. 1119 */ 1120 1121 /* Another skb with the same unconfirmed conntrack may 1122 * win the race. This may happen for bridge(br_flood) 1123 * or broadcast/multicast packets do skb_clone with 1124 * unconfirmed conntrack. 1125 */ 1126 if (unlikely(nf_ct_is_confirmed(ct))) { 1127 WARN_ON_ONCE(1); 1128 nf_conntrack_double_unlock(hash, reply_hash); 1129 local_bh_enable(); 1130 return NF_DROP; 1131 } 1132 1133 pr_debug("Confirming conntrack %p\n", ct); 1134 /* We have to check the DYING flag after unlink to prevent 1135 * a race against nf_ct_get_next_corpse() possibly called from 1136 * user context, else we insert an already 'dead' hash, blocking 1137 * further use of that particular connection -JM. 1138 */ 1139 nf_ct_del_from_dying_or_unconfirmed_list(ct); 1140 1141 if (unlikely(nf_ct_is_dying(ct))) { 1142 nf_ct_add_to_dying_list(ct); 1143 NF_CT_STAT_INC(net, insert_failed); 1144 goto dying; 1145 } 1146 1147 /* See if there's one in the list already, including reverse: 1148 NAT could have grabbed it without realizing, since we're 1149 not in the hash. If there is, we lost race. */ 1150 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 1151 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1152 zone, net)) 1153 goto out; 1154 1155 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 1156 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 1157 zone, net)) 1158 goto out; 1159 1160 /* Timer relative to confirmation time, not original 1161 setting time, otherwise we'd get timer wrap in 1162 weird delay cases. */ 1163 ct->timeout += nfct_time_stamp; 1164 1165 __nf_conntrack_insert_prepare(ct); 1166 1167 /* Since the lookup is lockless, hash insertion must be done after 1168 * starting the timer and setting the CONFIRMED bit. The RCU barriers 1169 * guarantee that no other CPU can find the conntrack before the above 1170 * stores are visible. 1171 */ 1172 __nf_conntrack_hash_insert(ct, hash, reply_hash); 1173 nf_conntrack_double_unlock(hash, reply_hash); 1174 local_bh_enable(); 1175 1176 help = nfct_help(ct); 1177 if (help && help->helper) 1178 nf_conntrack_event_cache(IPCT_HELPER, ct); 1179 1180 nf_conntrack_event_cache(master_ct(ct) ? 1181 IPCT_RELATED : IPCT_NEW, ct); 1182 return NF_ACCEPT; 1183 1184 out: 1185 ret = nf_ct_resolve_clash(skb, h, reply_hash); 1186 dying: 1187 nf_conntrack_double_unlock(hash, reply_hash); 1188 local_bh_enable(); 1189 return ret; 1190 } 1191 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 1192 1193 /* Returns true if a connection correspondings to the tuple (required 1194 for NAT). */ 1195 int 1196 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, 1197 const struct nf_conn *ignored_conntrack) 1198 { 1199 struct net *net = nf_ct_net(ignored_conntrack); 1200 const struct nf_conntrack_zone *zone; 1201 struct nf_conntrack_tuple_hash *h; 1202 struct hlist_nulls_head *ct_hash; 1203 unsigned int hash, hsize; 1204 struct hlist_nulls_node *n; 1205 struct nf_conn *ct; 1206 1207 zone = nf_ct_zone(ignored_conntrack); 1208 1209 rcu_read_lock(); 1210 begin: 1211 nf_conntrack_get_ht(&ct_hash, &hsize); 1212 hash = __hash_conntrack(net, tuple, hsize); 1213 1214 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 1215 ct = nf_ct_tuplehash_to_ctrack(h); 1216 1217 if (ct == ignored_conntrack) 1218 continue; 1219 1220 if (nf_ct_is_expired(ct)) { 1221 nf_ct_gc_expired(ct); 1222 continue; 1223 } 1224 1225 if (nf_ct_key_equal(h, tuple, zone, net)) { 1226 /* Tuple is taken already, so caller will need to find 1227 * a new source port to use. 1228 * 1229 * Only exception: 1230 * If the *original tuples* are identical, then both 1231 * conntracks refer to the same flow. 1232 * This is a rare situation, it can occur e.g. when 1233 * more than one UDP packet is sent from same socket 1234 * in different threads. 1235 * 1236 * Let nf_ct_resolve_clash() deal with this later. 1237 */ 1238 if (nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1239 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 1240 nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) 1241 continue; 1242 1243 NF_CT_STAT_INC_ATOMIC(net, found); 1244 rcu_read_unlock(); 1245 return 1; 1246 } 1247 } 1248 1249 if (get_nulls_value(n) != hash) { 1250 NF_CT_STAT_INC_ATOMIC(net, search_restart); 1251 goto begin; 1252 } 1253 1254 rcu_read_unlock(); 1255 1256 return 0; 1257 } 1258 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); 1259 1260 #define NF_CT_EVICTION_RANGE 8 1261 1262 /* There's a small race here where we may free a just-assured 1263 connection. Too bad: we're in trouble anyway. */ 1264 static unsigned int early_drop_list(struct net *net, 1265 struct hlist_nulls_head *head) 1266 { 1267 struct nf_conntrack_tuple_hash *h; 1268 struct hlist_nulls_node *n; 1269 unsigned int drops = 0; 1270 struct nf_conn *tmp; 1271 1272 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1273 tmp = nf_ct_tuplehash_to_ctrack(h); 1274 1275 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 1276 continue; 1277 1278 if (nf_ct_is_expired(tmp)) { 1279 nf_ct_gc_expired(tmp); 1280 continue; 1281 } 1282 1283 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1284 !net_eq(nf_ct_net(tmp), net) || 1285 nf_ct_is_dying(tmp)) 1286 continue; 1287 1288 if (!atomic_inc_not_zero(&tmp->ct_general.use)) 1289 continue; 1290 1291 /* kill only if still in same netns -- might have moved due to 1292 * SLAB_TYPESAFE_BY_RCU rules. 1293 * 1294 * We steal the timer reference. If that fails timer has 1295 * already fired or someone else deleted it. Just drop ref 1296 * and move to next entry. 1297 */ 1298 if (net_eq(nf_ct_net(tmp), net) && 1299 nf_ct_is_confirmed(tmp) && 1300 nf_ct_delete(tmp, 0, 0)) 1301 drops++; 1302 1303 nf_ct_put(tmp); 1304 } 1305 1306 return drops; 1307 } 1308 1309 static noinline int early_drop(struct net *net, unsigned int hash) 1310 { 1311 unsigned int i, bucket; 1312 1313 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1314 struct hlist_nulls_head *ct_hash; 1315 unsigned int hsize, drops; 1316 1317 rcu_read_lock(); 1318 nf_conntrack_get_ht(&ct_hash, &hsize); 1319 if (!i) 1320 bucket = reciprocal_scale(hash, hsize); 1321 else 1322 bucket = (bucket + 1) % hsize; 1323 1324 drops = early_drop_list(net, &ct_hash[bucket]); 1325 rcu_read_unlock(); 1326 1327 if (drops) { 1328 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1329 return true; 1330 } 1331 } 1332 1333 return false; 1334 } 1335 1336 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1337 { 1338 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1339 } 1340 1341 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1342 { 1343 const struct nf_conntrack_l4proto *l4proto; 1344 1345 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1346 return true; 1347 1348 l4proto = nf_ct_l4proto_find(nf_ct_protonum(ct)); 1349 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1350 return true; 1351 1352 return false; 1353 } 1354 1355 static void gc_worker(struct work_struct *work) 1356 { 1357 unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); 1358 unsigned int i, goal, buckets = 0, expired_count = 0; 1359 unsigned int nf_conntrack_max95 = 0; 1360 struct conntrack_gc_work *gc_work; 1361 unsigned int ratio, scanned = 0; 1362 unsigned long next_run; 1363 1364 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1365 1366 goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; 1367 i = gc_work->last_bucket; 1368 if (gc_work->early_drop) 1369 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1370 1371 do { 1372 struct nf_conntrack_tuple_hash *h; 1373 struct hlist_nulls_head *ct_hash; 1374 struct hlist_nulls_node *n; 1375 unsigned int hashsz; 1376 struct nf_conn *tmp; 1377 1378 i++; 1379 rcu_read_lock(); 1380 1381 nf_conntrack_get_ht(&ct_hash, &hashsz); 1382 if (i >= hashsz) 1383 i = 0; 1384 1385 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1386 struct nf_conntrack_net *cnet; 1387 struct net *net; 1388 1389 tmp = nf_ct_tuplehash_to_ctrack(h); 1390 1391 scanned++; 1392 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1393 nf_ct_offload_timeout(tmp); 1394 continue; 1395 } 1396 1397 if (nf_ct_is_expired(tmp)) { 1398 nf_ct_gc_expired(tmp); 1399 expired_count++; 1400 continue; 1401 } 1402 1403 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1404 continue; 1405 1406 net = nf_ct_net(tmp); 1407 cnet = net_generic(net, nf_conntrack_net_id); 1408 if (atomic_read(&cnet->count) < nf_conntrack_max95) 1409 continue; 1410 1411 /* need to take reference to avoid possible races */ 1412 if (!atomic_inc_not_zero(&tmp->ct_general.use)) 1413 continue; 1414 1415 if (gc_worker_skip_ct(tmp)) { 1416 nf_ct_put(tmp); 1417 continue; 1418 } 1419 1420 if (gc_worker_can_early_drop(tmp)) 1421 nf_ct_kill(tmp); 1422 1423 nf_ct_put(tmp); 1424 } 1425 1426 /* could check get_nulls_value() here and restart if ct 1427 * was moved to another chain. But given gc is best-effort 1428 * we will just continue with next hash slot. 1429 */ 1430 rcu_read_unlock(); 1431 cond_resched(); 1432 } while (++buckets < goal); 1433 1434 if (gc_work->exiting) 1435 return; 1436 1437 /* 1438 * Eviction will normally happen from the packet path, and not 1439 * from this gc worker. 1440 * 1441 * This worker is only here to reap expired entries when system went 1442 * idle after a busy period. 1443 * 1444 * The heuristics below are supposed to balance conflicting goals: 1445 * 1446 * 1. Minimize time until we notice a stale entry 1447 * 2. Maximize scan intervals to not waste cycles 1448 * 1449 * Normally, expire ratio will be close to 0. 1450 * 1451 * As soon as a sizeable fraction of the entries have expired 1452 * increase scan frequency. 1453 */ 1454 ratio = scanned ? expired_count * 100 / scanned : 0; 1455 if (ratio > GC_EVICT_RATIO) { 1456 gc_work->next_gc_run = min_interval; 1457 } else { 1458 unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; 1459 1460 BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); 1461 1462 gc_work->next_gc_run += min_interval; 1463 if (gc_work->next_gc_run > max) 1464 gc_work->next_gc_run = max; 1465 } 1466 1467 next_run = gc_work->next_gc_run; 1468 gc_work->last_bucket = i; 1469 gc_work->early_drop = false; 1470 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1471 } 1472 1473 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1474 { 1475 INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); 1476 gc_work->next_gc_run = HZ; 1477 gc_work->exiting = false; 1478 } 1479 1480 static struct nf_conn * 1481 __nf_conntrack_alloc(struct net *net, 1482 const struct nf_conntrack_zone *zone, 1483 const struct nf_conntrack_tuple *orig, 1484 const struct nf_conntrack_tuple *repl, 1485 gfp_t gfp, u32 hash) 1486 { 1487 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 1488 unsigned int ct_count; 1489 struct nf_conn *ct; 1490 1491 /* We don't want any race condition at early drop stage */ 1492 ct_count = atomic_inc_return(&cnet->count); 1493 1494 if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { 1495 if (!early_drop(net, hash)) { 1496 if (!conntrack_gc_work.early_drop) 1497 conntrack_gc_work.early_drop = true; 1498 atomic_dec(&cnet->count); 1499 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1500 return ERR_PTR(-ENOMEM); 1501 } 1502 } 1503 1504 /* 1505 * Do not use kmem_cache_zalloc(), as this cache uses 1506 * SLAB_TYPESAFE_BY_RCU. 1507 */ 1508 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1509 if (ct == NULL) 1510 goto out; 1511 1512 spin_lock_init(&ct->lock); 1513 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1514 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1515 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1516 /* save hash for reusing when confirming */ 1517 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1518 ct->status = 0; 1519 ct->timeout = 0; 1520 write_pnet(&ct->ct_net, net); 1521 memset(&ct->__nfct_init_offset, 0, 1522 offsetof(struct nf_conn, proto) - 1523 offsetof(struct nf_conn, __nfct_init_offset)); 1524 1525 nf_ct_zone_add(ct, zone); 1526 1527 /* Because we use RCU lookups, we set ct_general.use to zero before 1528 * this is inserted in any list. 1529 */ 1530 atomic_set(&ct->ct_general.use, 0); 1531 return ct; 1532 out: 1533 atomic_dec(&cnet->count); 1534 return ERR_PTR(-ENOMEM); 1535 } 1536 1537 struct nf_conn *nf_conntrack_alloc(struct net *net, 1538 const struct nf_conntrack_zone *zone, 1539 const struct nf_conntrack_tuple *orig, 1540 const struct nf_conntrack_tuple *repl, 1541 gfp_t gfp) 1542 { 1543 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1544 } 1545 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1546 1547 void nf_conntrack_free(struct nf_conn *ct) 1548 { 1549 struct net *net = nf_ct_net(ct); 1550 struct nf_conntrack_net *cnet; 1551 1552 /* A freed object has refcnt == 0, that's 1553 * the golden rule for SLAB_TYPESAFE_BY_RCU 1554 */ 1555 WARN_ON(atomic_read(&ct->ct_general.use) != 0); 1556 1557 nf_ct_ext_destroy(ct); 1558 kmem_cache_free(nf_conntrack_cachep, ct); 1559 cnet = net_generic(net, nf_conntrack_net_id); 1560 1561 smp_mb__before_atomic(); 1562 atomic_dec(&cnet->count); 1563 } 1564 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1565 1566 1567 /* Allocate a new conntrack: we return -ENOMEM if classification 1568 failed due to stress. Otherwise it really is unclassifiable. */ 1569 static noinline struct nf_conntrack_tuple_hash * 1570 init_conntrack(struct net *net, struct nf_conn *tmpl, 1571 const struct nf_conntrack_tuple *tuple, 1572 struct sk_buff *skb, 1573 unsigned int dataoff, u32 hash) 1574 { 1575 struct nf_conn *ct; 1576 struct nf_conn_help *help; 1577 struct nf_conntrack_tuple repl_tuple; 1578 struct nf_conntrack_ecache *ecache; 1579 struct nf_conntrack_expect *exp = NULL; 1580 const struct nf_conntrack_zone *zone; 1581 struct nf_conn_timeout *timeout_ext; 1582 struct nf_conntrack_zone tmp; 1583 struct nf_conntrack_net *cnet; 1584 1585 if (!nf_ct_invert_tuple(&repl_tuple, tuple)) { 1586 pr_debug("Can't invert tuple.\n"); 1587 return NULL; 1588 } 1589 1590 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1591 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1592 hash); 1593 if (IS_ERR(ct)) 1594 return (struct nf_conntrack_tuple_hash *)ct; 1595 1596 if (!nf_ct_add_synproxy(ct, tmpl)) { 1597 nf_conntrack_free(ct); 1598 return ERR_PTR(-ENOMEM); 1599 } 1600 1601 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1602 1603 if (timeout_ext) 1604 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1605 GFP_ATOMIC); 1606 1607 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1608 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1609 nf_ct_labels_ext_add(ct); 1610 1611 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1612 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1613 ecache ? ecache->expmask : 0, 1614 GFP_ATOMIC); 1615 1616 local_bh_disable(); 1617 cnet = net_generic(net, nf_conntrack_net_id); 1618 if (cnet->expect_count) { 1619 spin_lock(&nf_conntrack_expect_lock); 1620 exp = nf_ct_find_expectation(net, zone, tuple); 1621 if (exp) { 1622 pr_debug("expectation arrives ct=%p exp=%p\n", 1623 ct, exp); 1624 /* Welcome, Mr. Bond. We've been expecting you... */ 1625 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1626 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1627 ct->master = exp->master; 1628 if (exp->helper) { 1629 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1630 if (help) 1631 rcu_assign_pointer(help->helper, exp->helper); 1632 } 1633 1634 #ifdef CONFIG_NF_CONNTRACK_MARK 1635 ct->mark = exp->master->mark; 1636 #endif 1637 #ifdef CONFIG_NF_CONNTRACK_SECMARK 1638 ct->secmark = exp->master->secmark; 1639 #endif 1640 NF_CT_STAT_INC(net, expect_new); 1641 } 1642 spin_unlock(&nf_conntrack_expect_lock); 1643 } 1644 if (!exp) 1645 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1646 1647 /* Now it is inserted into the unconfirmed list, bump refcount */ 1648 nf_conntrack_get(&ct->ct_general); 1649 nf_ct_add_to_unconfirmed_list(ct); 1650 1651 local_bh_enable(); 1652 1653 if (exp) { 1654 if (exp->expectfn) 1655 exp->expectfn(ct, exp); 1656 nf_ct_expect_put(exp); 1657 } 1658 1659 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1660 } 1661 1662 /* On success, returns 0, sets skb->_nfct | ctinfo */ 1663 static int 1664 resolve_normal_ct(struct nf_conn *tmpl, 1665 struct sk_buff *skb, 1666 unsigned int dataoff, 1667 u_int8_t protonum, 1668 const struct nf_hook_state *state) 1669 { 1670 const struct nf_conntrack_zone *zone; 1671 struct nf_conntrack_tuple tuple; 1672 struct nf_conntrack_tuple_hash *h; 1673 enum ip_conntrack_info ctinfo; 1674 struct nf_conntrack_zone tmp; 1675 struct nf_conn *ct; 1676 u32 hash; 1677 1678 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1679 dataoff, state->pf, protonum, state->net, 1680 &tuple)) { 1681 pr_debug("Can't get tuple\n"); 1682 return 0; 1683 } 1684 1685 /* look for tuple match */ 1686 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1687 hash = hash_conntrack_raw(&tuple, state->net); 1688 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1689 if (!h) { 1690 h = init_conntrack(state->net, tmpl, &tuple, 1691 skb, dataoff, hash); 1692 if (!h) 1693 return 0; 1694 if (IS_ERR(h)) 1695 return PTR_ERR(h); 1696 } 1697 ct = nf_ct_tuplehash_to_ctrack(h); 1698 1699 /* It exists; we have (non-exclusive) reference. */ 1700 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1701 ctinfo = IP_CT_ESTABLISHED_REPLY; 1702 } else { 1703 /* Once we've had two way comms, always ESTABLISHED. */ 1704 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1705 pr_debug("normal packet for %p\n", ct); 1706 ctinfo = IP_CT_ESTABLISHED; 1707 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { 1708 pr_debug("related packet for %p\n", ct); 1709 ctinfo = IP_CT_RELATED; 1710 } else { 1711 pr_debug("new packet for %p\n", ct); 1712 ctinfo = IP_CT_NEW; 1713 } 1714 } 1715 nf_ct_set(skb, ct, ctinfo); 1716 return 0; 1717 } 1718 1719 /* 1720 * icmp packets need special treatment to handle error messages that are 1721 * related to a connection. 1722 * 1723 * Callers need to check if skb has a conntrack assigned when this 1724 * helper returns; in such case skb belongs to an already known connection. 1725 */ 1726 static unsigned int __cold 1727 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1728 struct sk_buff *skb, 1729 unsigned int dataoff, 1730 u8 protonum, 1731 const struct nf_hook_state *state) 1732 { 1733 int ret; 1734 1735 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1736 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1737 #if IS_ENABLED(CONFIG_IPV6) 1738 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1739 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1740 #endif 1741 else 1742 return NF_ACCEPT; 1743 1744 if (ret <= 0) 1745 NF_CT_STAT_INC_ATOMIC(state->net, error); 1746 1747 return ret; 1748 } 1749 1750 static int generic_packet(struct nf_conn *ct, struct sk_buff *skb, 1751 enum ip_conntrack_info ctinfo) 1752 { 1753 const unsigned int *timeout = nf_ct_timeout_lookup(ct); 1754 1755 if (!timeout) 1756 timeout = &nf_generic_pernet(nf_ct_net(ct))->timeout; 1757 1758 nf_ct_refresh_acct(ct, ctinfo, skb, *timeout); 1759 return NF_ACCEPT; 1760 } 1761 1762 /* Returns verdict for packet, or -1 for invalid. */ 1763 static int nf_conntrack_handle_packet(struct nf_conn *ct, 1764 struct sk_buff *skb, 1765 unsigned int dataoff, 1766 enum ip_conntrack_info ctinfo, 1767 const struct nf_hook_state *state) 1768 { 1769 switch (nf_ct_protonum(ct)) { 1770 case IPPROTO_TCP: 1771 return nf_conntrack_tcp_packet(ct, skb, dataoff, 1772 ctinfo, state); 1773 case IPPROTO_UDP: 1774 return nf_conntrack_udp_packet(ct, skb, dataoff, 1775 ctinfo, state); 1776 case IPPROTO_ICMP: 1777 return nf_conntrack_icmp_packet(ct, skb, ctinfo, state); 1778 #if IS_ENABLED(CONFIG_IPV6) 1779 case IPPROTO_ICMPV6: 1780 return nf_conntrack_icmpv6_packet(ct, skb, ctinfo, state); 1781 #endif 1782 #ifdef CONFIG_NF_CT_PROTO_UDPLITE 1783 case IPPROTO_UDPLITE: 1784 return nf_conntrack_udplite_packet(ct, skb, dataoff, 1785 ctinfo, state); 1786 #endif 1787 #ifdef CONFIG_NF_CT_PROTO_SCTP 1788 case IPPROTO_SCTP: 1789 return nf_conntrack_sctp_packet(ct, skb, dataoff, 1790 ctinfo, state); 1791 #endif 1792 #ifdef CONFIG_NF_CT_PROTO_DCCP 1793 case IPPROTO_DCCP: 1794 return nf_conntrack_dccp_packet(ct, skb, dataoff, 1795 ctinfo, state); 1796 #endif 1797 #ifdef CONFIG_NF_CT_PROTO_GRE 1798 case IPPROTO_GRE: 1799 return nf_conntrack_gre_packet(ct, skb, dataoff, 1800 ctinfo, state); 1801 #endif 1802 } 1803 1804 return generic_packet(ct, skb, ctinfo); 1805 } 1806 1807 unsigned int 1808 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 1809 { 1810 enum ip_conntrack_info ctinfo; 1811 struct nf_conn *ct, *tmpl; 1812 u_int8_t protonum; 1813 int dataoff, ret; 1814 1815 tmpl = nf_ct_get(skb, &ctinfo); 1816 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1817 /* Previously seen (loopback or untracked)? Ignore. */ 1818 if ((tmpl && !nf_ct_is_template(tmpl)) || 1819 ctinfo == IP_CT_UNTRACKED) 1820 return NF_ACCEPT; 1821 skb->_nfct = 0; 1822 } 1823 1824 /* rcu_read_lock()ed by nf_hook_thresh */ 1825 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1826 if (dataoff <= 0) { 1827 pr_debug("not prepared to track yet or error occurred\n"); 1828 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1829 ret = NF_ACCEPT; 1830 goto out; 1831 } 1832 1833 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1834 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1835 protonum, state); 1836 if (ret <= 0) { 1837 ret = -ret; 1838 goto out; 1839 } 1840 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1841 if (skb->_nfct) 1842 goto out; 1843 } 1844 repeat: 1845 ret = resolve_normal_ct(tmpl, skb, dataoff, 1846 protonum, state); 1847 if (ret < 0) { 1848 /* Too stressed to deal. */ 1849 NF_CT_STAT_INC_ATOMIC(state->net, drop); 1850 ret = NF_DROP; 1851 goto out; 1852 } 1853 1854 ct = nf_ct_get(skb, &ctinfo); 1855 if (!ct) { 1856 /* Not valid part of a connection */ 1857 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1858 ret = NF_ACCEPT; 1859 goto out; 1860 } 1861 1862 ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state); 1863 if (ret <= 0) { 1864 /* Invalid: inverse of the return code tells 1865 * the netfilter core what to do */ 1866 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1867 nf_conntrack_put(&ct->ct_general); 1868 skb->_nfct = 0; 1869 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1870 if (ret == -NF_DROP) 1871 NF_CT_STAT_INC_ATOMIC(state->net, drop); 1872 /* Special case: TCP tracker reports an attempt to reopen a 1873 * closed/aborted connection. We have to go back and create a 1874 * fresh conntrack. 1875 */ 1876 if (ret == -NF_REPEAT) 1877 goto repeat; 1878 ret = -ret; 1879 goto out; 1880 } 1881 1882 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 1883 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 1884 nf_conntrack_event_cache(IPCT_REPLY, ct); 1885 out: 1886 if (tmpl) 1887 nf_ct_put(tmpl); 1888 1889 return ret; 1890 } 1891 EXPORT_SYMBOL_GPL(nf_conntrack_in); 1892 1893 /* Alter reply tuple (maybe alter helper). This is for NAT, and is 1894 implicitly racy: see __nf_conntrack_confirm */ 1895 void nf_conntrack_alter_reply(struct nf_conn *ct, 1896 const struct nf_conntrack_tuple *newreply) 1897 { 1898 struct nf_conn_help *help = nfct_help(ct); 1899 1900 /* Should be unconfirmed, so not in hash table yet */ 1901 WARN_ON(nf_ct_is_confirmed(ct)); 1902 1903 pr_debug("Altering reply tuple of %p to ", ct); 1904 nf_ct_dump_tuple(newreply); 1905 1906 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 1907 if (ct->master || (help && !hlist_empty(&help->expectations))) 1908 return; 1909 1910 rcu_read_lock(); 1911 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); 1912 rcu_read_unlock(); 1913 } 1914 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 1915 1916 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 1917 void __nf_ct_refresh_acct(struct nf_conn *ct, 1918 enum ip_conntrack_info ctinfo, 1919 const struct sk_buff *skb, 1920 u32 extra_jiffies, 1921 bool do_acct) 1922 { 1923 /* Only update if this is not a fixed timeout */ 1924 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 1925 goto acct; 1926 1927 /* If not in hash table, timer will not be active yet */ 1928 if (nf_ct_is_confirmed(ct)) 1929 extra_jiffies += nfct_time_stamp; 1930 1931 if (READ_ONCE(ct->timeout) != extra_jiffies) 1932 WRITE_ONCE(ct->timeout, extra_jiffies); 1933 acct: 1934 if (do_acct) 1935 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 1936 } 1937 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 1938 1939 bool nf_ct_kill_acct(struct nf_conn *ct, 1940 enum ip_conntrack_info ctinfo, 1941 const struct sk_buff *skb) 1942 { 1943 nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); 1944 1945 return nf_ct_delete(ct, 0, 0); 1946 } 1947 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 1948 1949 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1950 1951 #include <linux/netfilter/nfnetlink.h> 1952 #include <linux/netfilter/nfnetlink_conntrack.h> 1953 #include <linux/mutex.h> 1954 1955 /* Generic function for tcp/udp/sctp/dccp and alike. */ 1956 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 1957 const struct nf_conntrack_tuple *tuple) 1958 { 1959 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 1960 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 1961 goto nla_put_failure; 1962 return 0; 1963 1964 nla_put_failure: 1965 return -1; 1966 } 1967 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 1968 1969 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 1970 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 1971 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 1972 }; 1973 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 1974 1975 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 1976 struct nf_conntrack_tuple *t, 1977 u_int32_t flags) 1978 { 1979 if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { 1980 if (!tb[CTA_PROTO_SRC_PORT]) 1981 return -EINVAL; 1982 1983 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 1984 } 1985 1986 if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { 1987 if (!tb[CTA_PROTO_DST_PORT]) 1988 return -EINVAL; 1989 1990 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 1991 } 1992 1993 return 0; 1994 } 1995 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 1996 1997 unsigned int nf_ct_port_nlattr_tuple_size(void) 1998 { 1999 static unsigned int size __read_mostly; 2000 2001 if (!size) 2002 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 2003 2004 return size; 2005 } 2006 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 2007 #endif 2008 2009 /* Used by ipt_REJECT and ip6t_REJECT. */ 2010 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 2011 { 2012 struct nf_conn *ct; 2013 enum ip_conntrack_info ctinfo; 2014 2015 /* This ICMP is in reverse direction to the packet which caused it */ 2016 ct = nf_ct_get(skb, &ctinfo); 2017 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 2018 ctinfo = IP_CT_RELATED_REPLY; 2019 else 2020 ctinfo = IP_CT_RELATED; 2021 2022 /* Attach to new skbuff, and increment count */ 2023 nf_ct_set(nskb, ct, ctinfo); 2024 nf_conntrack_get(skb_nfct(nskb)); 2025 } 2026 2027 static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2028 struct nf_conn *ct, 2029 enum ip_conntrack_info ctinfo) 2030 { 2031 struct nf_conntrack_tuple_hash *h; 2032 struct nf_conntrack_tuple tuple; 2033 struct nf_nat_hook *nat_hook; 2034 unsigned int status; 2035 int dataoff; 2036 u16 l3num; 2037 u8 l4num; 2038 2039 l3num = nf_ct_l3num(ct); 2040 2041 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2042 if (dataoff <= 0) 2043 return -1; 2044 2045 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2046 l4num, net, &tuple)) 2047 return -1; 2048 2049 if (ct->status & IPS_SRC_NAT) { 2050 memcpy(tuple.src.u3.all, 2051 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2052 sizeof(tuple.src.u3.all)); 2053 tuple.src.u.all = 2054 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2055 } 2056 2057 if (ct->status & IPS_DST_NAT) { 2058 memcpy(tuple.dst.u3.all, 2059 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2060 sizeof(tuple.dst.u3.all)); 2061 tuple.dst.u.all = 2062 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2063 } 2064 2065 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2066 if (!h) 2067 return 0; 2068 2069 /* Store status bits of the conntrack that is clashing to re-do NAT 2070 * mangling according to what it has been done already to this packet. 2071 */ 2072 status = ct->status; 2073 2074 nf_ct_put(ct); 2075 ct = nf_ct_tuplehash_to_ctrack(h); 2076 nf_ct_set(skb, ct, ctinfo); 2077 2078 nat_hook = rcu_dereference(nf_nat_hook); 2079 if (!nat_hook) 2080 return 0; 2081 2082 if (status & IPS_SRC_NAT && 2083 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 2084 IP_CT_DIR_ORIGINAL) == NF_DROP) 2085 return -1; 2086 2087 if (status & IPS_DST_NAT && 2088 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 2089 IP_CT_DIR_ORIGINAL) == NF_DROP) 2090 return -1; 2091 2092 return 0; 2093 } 2094 2095 /* This packet is coming from userspace via nf_queue, complete the packet 2096 * processing after the helper invocation in nf_confirm(). 2097 */ 2098 static int nf_confirm_cthelper(struct sk_buff *skb, struct nf_conn *ct, 2099 enum ip_conntrack_info ctinfo) 2100 { 2101 const struct nf_conntrack_helper *helper; 2102 const struct nf_conn_help *help; 2103 int protoff; 2104 2105 help = nfct_help(ct); 2106 if (!help) 2107 return 0; 2108 2109 helper = rcu_dereference(help->helper); 2110 if (!(helper->flags & NF_CT_HELPER_F_USERSPACE)) 2111 return 0; 2112 2113 switch (nf_ct_l3num(ct)) { 2114 case NFPROTO_IPV4: 2115 protoff = skb_network_offset(skb) + ip_hdrlen(skb); 2116 break; 2117 #if IS_ENABLED(CONFIG_IPV6) 2118 case NFPROTO_IPV6: { 2119 __be16 frag_off; 2120 u8 pnum; 2121 2122 pnum = ipv6_hdr(skb)->nexthdr; 2123 protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, 2124 &frag_off); 2125 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) 2126 return 0; 2127 break; 2128 } 2129 #endif 2130 default: 2131 return 0; 2132 } 2133 2134 if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && 2135 !nf_is_loopback_packet(skb)) { 2136 if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { 2137 NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); 2138 return -1; 2139 } 2140 } 2141 2142 /* We've seen it coming out the other side: confirm it */ 2143 return nf_conntrack_confirm(skb) == NF_DROP ? - 1 : 0; 2144 } 2145 2146 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 2147 { 2148 enum ip_conntrack_info ctinfo; 2149 struct nf_conn *ct; 2150 int err; 2151 2152 ct = nf_ct_get(skb, &ctinfo); 2153 if (!ct) 2154 return 0; 2155 2156 if (!nf_ct_is_confirmed(ct)) { 2157 err = __nf_conntrack_update(net, skb, ct, ctinfo); 2158 if (err < 0) 2159 return err; 2160 2161 ct = nf_ct_get(skb, &ctinfo); 2162 } 2163 2164 return nf_confirm_cthelper(skb, ct, ctinfo); 2165 } 2166 2167 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 2168 const struct sk_buff *skb) 2169 { 2170 const struct nf_conntrack_tuple *src_tuple; 2171 const struct nf_conntrack_tuple_hash *hash; 2172 struct nf_conntrack_tuple srctuple; 2173 enum ip_conntrack_info ctinfo; 2174 struct nf_conn *ct; 2175 2176 ct = nf_ct_get(skb, &ctinfo); 2177 if (ct) { 2178 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 2179 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2180 return true; 2181 } 2182 2183 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 2184 NFPROTO_IPV4, dev_net(skb->dev), 2185 &srctuple)) 2186 return false; 2187 2188 hash = nf_conntrack_find_get(dev_net(skb->dev), 2189 &nf_ct_zone_dflt, 2190 &srctuple); 2191 if (!hash) 2192 return false; 2193 2194 ct = nf_ct_tuplehash_to_ctrack(hash); 2195 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 2196 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 2197 nf_ct_put(ct); 2198 2199 return true; 2200 } 2201 2202 /* Bring out ya dead! */ 2203 static struct nf_conn * 2204 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 2205 void *data, unsigned int *bucket) 2206 { 2207 struct nf_conntrack_tuple_hash *h; 2208 struct nf_conn *ct; 2209 struct hlist_nulls_node *n; 2210 spinlock_t *lockp; 2211 2212 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 2213 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 2214 local_bh_disable(); 2215 nf_conntrack_lock(lockp); 2216 if (*bucket < nf_conntrack_htable_size) { 2217 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { 2218 if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) 2219 continue; 2220 /* All nf_conn objects are added to hash table twice, one 2221 * for original direction tuple, once for the reply tuple. 2222 * 2223 * Exception: In the IPS_NAT_CLASH case, only the reply 2224 * tuple is added (the original tuple already existed for 2225 * a different object). 2226 * 2227 * We only need to call the iterator once for each 2228 * conntrack, so we just use the 'reply' direction 2229 * tuple while iterating. 2230 */ 2231 ct = nf_ct_tuplehash_to_ctrack(h); 2232 if (iter(ct, data)) 2233 goto found; 2234 } 2235 } 2236 spin_unlock(lockp); 2237 local_bh_enable(); 2238 cond_resched(); 2239 } 2240 2241 return NULL; 2242 found: 2243 atomic_inc(&ct->ct_general.use); 2244 spin_unlock(lockp); 2245 local_bh_enable(); 2246 return ct; 2247 } 2248 2249 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 2250 void *data, u32 portid, int report) 2251 { 2252 unsigned int bucket = 0, sequence; 2253 struct nf_conn *ct; 2254 2255 might_sleep(); 2256 2257 for (;;) { 2258 sequence = read_seqcount_begin(&nf_conntrack_generation); 2259 2260 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { 2261 /* Time to push up daises... */ 2262 2263 nf_ct_delete(ct, portid, report); 2264 nf_ct_put(ct); 2265 cond_resched(); 2266 } 2267 2268 if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) 2269 break; 2270 bucket = 0; 2271 } 2272 } 2273 2274 struct iter_data { 2275 int (*iter)(struct nf_conn *i, void *data); 2276 void *data; 2277 struct net *net; 2278 }; 2279 2280 static int iter_net_only(struct nf_conn *i, void *data) 2281 { 2282 struct iter_data *d = data; 2283 2284 if (!net_eq(d->net, nf_ct_net(i))) 2285 return 0; 2286 2287 return d->iter(i, d->data); 2288 } 2289 2290 static void 2291 __nf_ct_unconfirmed_destroy(struct net *net) 2292 { 2293 int cpu; 2294 2295 for_each_possible_cpu(cpu) { 2296 struct nf_conntrack_tuple_hash *h; 2297 struct hlist_nulls_node *n; 2298 struct ct_pcpu *pcpu; 2299 2300 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 2301 2302 spin_lock_bh(&pcpu->lock); 2303 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { 2304 struct nf_conn *ct; 2305 2306 ct = nf_ct_tuplehash_to_ctrack(h); 2307 2308 /* we cannot call iter() on unconfirmed list, the 2309 * owning cpu can reallocate ct->ext at any time. 2310 */ 2311 set_bit(IPS_DYING_BIT, &ct->status); 2312 } 2313 spin_unlock_bh(&pcpu->lock); 2314 cond_resched(); 2315 } 2316 } 2317 2318 void nf_ct_unconfirmed_destroy(struct net *net) 2319 { 2320 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 2321 2322 might_sleep(); 2323 2324 if (atomic_read(&cnet->count) > 0) { 2325 __nf_ct_unconfirmed_destroy(net); 2326 nf_queue_nf_hook_drop(net); 2327 synchronize_net(); 2328 } 2329 } 2330 EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); 2331 2332 void nf_ct_iterate_cleanup_net(struct net *net, 2333 int (*iter)(struct nf_conn *i, void *data), 2334 void *data, u32 portid, int report) 2335 { 2336 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 2337 struct iter_data d; 2338 2339 might_sleep(); 2340 2341 if (atomic_read(&cnet->count) == 0) 2342 return; 2343 2344 d.iter = iter; 2345 d.data = data; 2346 d.net = net; 2347 2348 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); 2349 } 2350 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); 2351 2352 /** 2353 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table 2354 * @iter: callback to invoke for each conntrack 2355 * @data: data to pass to @iter 2356 * 2357 * Like nf_ct_iterate_cleanup, but first marks conntracks on the 2358 * unconfirmed list as dying (so they will not be inserted into 2359 * main table). 2360 * 2361 * Can only be called in module exit path. 2362 */ 2363 void 2364 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2365 { 2366 struct net *net; 2367 2368 down_read(&net_rwsem); 2369 for_each_net(net) { 2370 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 2371 2372 if (atomic_read(&cnet->count) == 0) 2373 continue; 2374 __nf_ct_unconfirmed_destroy(net); 2375 nf_queue_nf_hook_drop(net); 2376 } 2377 up_read(&net_rwsem); 2378 2379 /* Need to wait for netns cleanup worker to finish, if its 2380 * running -- it might have deleted a net namespace from 2381 * the global list, so our __nf_ct_unconfirmed_destroy() might 2382 * not have affected all namespaces. 2383 */ 2384 net_ns_barrier(); 2385 2386 /* a conntrack could have been unlinked from unconfirmed list 2387 * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). 2388 * This makes sure its inserted into conntrack table. 2389 */ 2390 synchronize_net(); 2391 2392 nf_ct_iterate_cleanup(iter, data, 0, 0); 2393 } 2394 EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy); 2395 2396 static int kill_all(struct nf_conn *i, void *data) 2397 { 2398 return net_eq(nf_ct_net(i), data); 2399 } 2400 2401 void nf_conntrack_cleanup_start(void) 2402 { 2403 conntrack_gc_work.exiting = true; 2404 RCU_INIT_POINTER(ip_ct_attach, NULL); 2405 } 2406 2407 void nf_conntrack_cleanup_end(void) 2408 { 2409 RCU_INIT_POINTER(nf_ct_hook, NULL); 2410 cancel_delayed_work_sync(&conntrack_gc_work.dwork); 2411 kvfree(nf_conntrack_hash); 2412 2413 nf_conntrack_proto_fini(); 2414 nf_conntrack_seqadj_fini(); 2415 nf_conntrack_labels_fini(); 2416 nf_conntrack_helper_fini(); 2417 nf_conntrack_timeout_fini(); 2418 nf_conntrack_ecache_fini(); 2419 nf_conntrack_tstamp_fini(); 2420 nf_conntrack_acct_fini(); 2421 nf_conntrack_expect_fini(); 2422 2423 kmem_cache_destroy(nf_conntrack_cachep); 2424 } 2425 2426 /* 2427 * Mishearing the voices in his head, our hero wonders how he's 2428 * supposed to kill the mall. 2429 */ 2430 void nf_conntrack_cleanup_net(struct net *net) 2431 { 2432 LIST_HEAD(single); 2433 2434 list_add(&net->exit_list, &single); 2435 nf_conntrack_cleanup_net_list(&single); 2436 } 2437 2438 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2439 { 2440 int busy; 2441 struct net *net; 2442 2443 /* 2444 * This makes sure all current packets have passed through 2445 * netfilter framework. Roll on, two-stage module 2446 * delete... 2447 */ 2448 synchronize_net(); 2449 i_see_dead_people: 2450 busy = 0; 2451 list_for_each_entry(net, net_exit_list, exit_list) { 2452 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 2453 2454 nf_ct_iterate_cleanup(kill_all, net, 0, 0); 2455 if (atomic_read(&cnet->count) != 0) 2456 busy = 1; 2457 } 2458 if (busy) { 2459 schedule(); 2460 goto i_see_dead_people; 2461 } 2462 2463 list_for_each_entry(net, net_exit_list, exit_list) { 2464 nf_conntrack_proto_pernet_fini(net); 2465 nf_conntrack_ecache_pernet_fini(net); 2466 nf_conntrack_expect_pernet_fini(net); 2467 free_percpu(net->ct.stat); 2468 free_percpu(net->ct.pcpu_lists); 2469 } 2470 } 2471 2472 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) 2473 { 2474 struct hlist_nulls_head *hash; 2475 unsigned int nr_slots, i; 2476 2477 if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) 2478 return NULL; 2479 2480 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 2481 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 2482 2483 hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); 2484 2485 if (hash && nulls) 2486 for (i = 0; i < nr_slots; i++) 2487 INIT_HLIST_NULLS_HEAD(&hash[i], i); 2488 2489 return hash; 2490 } 2491 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); 2492 2493 int nf_conntrack_hash_resize(unsigned int hashsize) 2494 { 2495 int i, bucket; 2496 unsigned int old_size; 2497 struct hlist_nulls_head *hash, *old_hash; 2498 struct nf_conntrack_tuple_hash *h; 2499 struct nf_conn *ct; 2500 2501 if (!hashsize) 2502 return -EINVAL; 2503 2504 hash = nf_ct_alloc_hashtable(&hashsize, 1); 2505 if (!hash) 2506 return -ENOMEM; 2507 2508 old_size = nf_conntrack_htable_size; 2509 if (old_size == hashsize) { 2510 kvfree(hash); 2511 return 0; 2512 } 2513 2514 local_bh_disable(); 2515 nf_conntrack_all_lock(); 2516 write_seqcount_begin(&nf_conntrack_generation); 2517 2518 /* Lookups in the old hash might happen in parallel, which means we 2519 * might get false negatives during connection lookup. New connections 2520 * created because of a false negative won't make it into the hash 2521 * though since that required taking the locks. 2522 */ 2523 2524 for (i = 0; i < nf_conntrack_htable_size; i++) { 2525 while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { 2526 h = hlist_nulls_entry(nf_conntrack_hash[i].first, 2527 struct nf_conntrack_tuple_hash, hnnode); 2528 ct = nf_ct_tuplehash_to_ctrack(h); 2529 hlist_nulls_del_rcu(&h->hnnode); 2530 bucket = __hash_conntrack(nf_ct_net(ct), 2531 &h->tuple, hashsize); 2532 hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); 2533 } 2534 } 2535 old_size = nf_conntrack_htable_size; 2536 old_hash = nf_conntrack_hash; 2537 2538 nf_conntrack_hash = hash; 2539 nf_conntrack_htable_size = hashsize; 2540 2541 write_seqcount_end(&nf_conntrack_generation); 2542 nf_conntrack_all_unlock(); 2543 local_bh_enable(); 2544 2545 synchronize_net(); 2546 kvfree(old_hash); 2547 return 0; 2548 } 2549 2550 int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp) 2551 { 2552 unsigned int hashsize; 2553 int rc; 2554 2555 if (current->nsproxy->net_ns != &init_net) 2556 return -EOPNOTSUPP; 2557 2558 /* On boot, we can set this without any fancy locking. */ 2559 if (!nf_conntrack_hash) 2560 return param_set_uint(val, kp); 2561 2562 rc = kstrtouint(val, 0, &hashsize); 2563 if (rc) 2564 return rc; 2565 2566 return nf_conntrack_hash_resize(hashsize); 2567 } 2568 2569 static __always_inline unsigned int total_extension_size(void) 2570 { 2571 /* remember to add new extensions below */ 2572 BUILD_BUG_ON(NF_CT_EXT_NUM > 9); 2573 2574 return sizeof(struct nf_ct_ext) + 2575 sizeof(struct nf_conn_help) 2576 #if IS_ENABLED(CONFIG_NF_NAT) 2577 + sizeof(struct nf_conn_nat) 2578 #endif 2579 + sizeof(struct nf_conn_seqadj) 2580 + sizeof(struct nf_conn_acct) 2581 #ifdef CONFIG_NF_CONNTRACK_EVENTS 2582 + sizeof(struct nf_conntrack_ecache) 2583 #endif 2584 #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP 2585 + sizeof(struct nf_conn_tstamp) 2586 #endif 2587 #ifdef CONFIG_NF_CONNTRACK_TIMEOUT 2588 + sizeof(struct nf_conn_timeout) 2589 #endif 2590 #ifdef CONFIG_NF_CONNTRACK_LABELS 2591 + sizeof(struct nf_conn_labels) 2592 #endif 2593 #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) 2594 + sizeof(struct nf_conn_synproxy) 2595 #endif 2596 ; 2597 }; 2598 2599 int nf_conntrack_init_start(void) 2600 { 2601 unsigned long nr_pages = totalram_pages(); 2602 int max_factor = 8; 2603 int ret = -ENOMEM; 2604 int i; 2605 2606 /* struct nf_ct_ext uses u8 to store offsets/size */ 2607 BUILD_BUG_ON(total_extension_size() > 255u); 2608 2609 seqcount_spinlock_init(&nf_conntrack_generation, 2610 &nf_conntrack_locks_all_lock); 2611 2612 for (i = 0; i < CONNTRACK_LOCKS; i++) 2613 spin_lock_init(&nf_conntrack_locks[i]); 2614 2615 if (!nf_conntrack_htable_size) { 2616 /* Idea from tcp.c: use 1/16384 of memory. 2617 * On i386: 32MB machine has 512 buckets. 2618 * >= 1GB machines have 16384 buckets. 2619 * >= 4GB machines have 65536 buckets. 2620 */ 2621 nf_conntrack_htable_size 2622 = (((nr_pages << PAGE_SHIFT) / 16384) 2623 / sizeof(struct hlist_head)); 2624 if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) 2625 nf_conntrack_htable_size = 65536; 2626 else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) 2627 nf_conntrack_htable_size = 16384; 2628 if (nf_conntrack_htable_size < 32) 2629 nf_conntrack_htable_size = 32; 2630 2631 /* Use a max. factor of four by default to get the same max as 2632 * with the old struct list_heads. When a table size is given 2633 * we use the old value of 8 to avoid reducing the max. 2634 * entries. */ 2635 max_factor = 4; 2636 } 2637 2638 nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); 2639 if (!nf_conntrack_hash) 2640 return -ENOMEM; 2641 2642 nf_conntrack_max = max_factor * nf_conntrack_htable_size; 2643 2644 nf_conntrack_cachep = kmem_cache_create("nf_conntrack", 2645 sizeof(struct nf_conn), 2646 NFCT_INFOMASK + 1, 2647 SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); 2648 if (!nf_conntrack_cachep) 2649 goto err_cachep; 2650 2651 ret = nf_conntrack_expect_init(); 2652 if (ret < 0) 2653 goto err_expect; 2654 2655 ret = nf_conntrack_acct_init(); 2656 if (ret < 0) 2657 goto err_acct; 2658 2659 ret = nf_conntrack_tstamp_init(); 2660 if (ret < 0) 2661 goto err_tstamp; 2662 2663 ret = nf_conntrack_ecache_init(); 2664 if (ret < 0) 2665 goto err_ecache; 2666 2667 ret = nf_conntrack_timeout_init(); 2668 if (ret < 0) 2669 goto err_timeout; 2670 2671 ret = nf_conntrack_helper_init(); 2672 if (ret < 0) 2673 goto err_helper; 2674 2675 ret = nf_conntrack_labels_init(); 2676 if (ret < 0) 2677 goto err_labels; 2678 2679 ret = nf_conntrack_seqadj_init(); 2680 if (ret < 0) 2681 goto err_seqadj; 2682 2683 ret = nf_conntrack_proto_init(); 2684 if (ret < 0) 2685 goto err_proto; 2686 2687 conntrack_gc_work_init(&conntrack_gc_work); 2688 queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ); 2689 2690 return 0; 2691 2692 err_proto: 2693 nf_conntrack_seqadj_fini(); 2694 err_seqadj: 2695 nf_conntrack_labels_fini(); 2696 err_labels: 2697 nf_conntrack_helper_fini(); 2698 err_helper: 2699 nf_conntrack_timeout_fini(); 2700 err_timeout: 2701 nf_conntrack_ecache_fini(); 2702 err_ecache: 2703 nf_conntrack_tstamp_fini(); 2704 err_tstamp: 2705 nf_conntrack_acct_fini(); 2706 err_acct: 2707 nf_conntrack_expect_fini(); 2708 err_expect: 2709 kmem_cache_destroy(nf_conntrack_cachep); 2710 err_cachep: 2711 kvfree(nf_conntrack_hash); 2712 return ret; 2713 } 2714 2715 static struct nf_ct_hook nf_conntrack_hook = { 2716 .update = nf_conntrack_update, 2717 .destroy = destroy_conntrack, 2718 .get_tuple_skb = nf_conntrack_get_tuple_skb, 2719 }; 2720 2721 void nf_conntrack_init_end(void) 2722 { 2723 /* For use by REJECT target */ 2724 RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach); 2725 RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook); 2726 } 2727 2728 /* 2729 * We need to use special "null" values, not used in hash table 2730 */ 2731 #define UNCONFIRMED_NULLS_VAL ((1<<30)+0) 2732 #define DYING_NULLS_VAL ((1<<30)+1) 2733 2734 int nf_conntrack_init_net(struct net *net) 2735 { 2736 struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id); 2737 int ret = -ENOMEM; 2738 int cpu; 2739 2740 BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); 2741 BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS); 2742 atomic_set(&cnet->count, 0); 2743 2744 net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); 2745 if (!net->ct.pcpu_lists) 2746 goto err_stat; 2747 2748 for_each_possible_cpu(cpu) { 2749 struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 2750 2751 spin_lock_init(&pcpu->lock); 2752 INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); 2753 INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); 2754 } 2755 2756 net->ct.stat = alloc_percpu(struct ip_conntrack_stat); 2757 if (!net->ct.stat) 2758 goto err_pcpu_lists; 2759 2760 ret = nf_conntrack_expect_pernet_init(net); 2761 if (ret < 0) 2762 goto err_expect; 2763 2764 nf_conntrack_acct_pernet_init(net); 2765 nf_conntrack_tstamp_pernet_init(net); 2766 nf_conntrack_ecache_pernet_init(net); 2767 nf_conntrack_helper_pernet_init(net); 2768 nf_conntrack_proto_pernet_init(net); 2769 2770 return 0; 2771 2772 err_expect: 2773 free_percpu(net->ct.stat); 2774 err_pcpu_lists: 2775 free_percpu(net->ct.pcpu_lists); 2776 err_stat: 2777 return ret; 2778 } 2779