1 /* Connection state tracking for netfilter. This is separated from, 2 but required by, the NAT layer; it can also be used by an iptables 3 extension. */ 4 5 /* (C) 1999-2001 Paul `Rusty' Russell 6 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> 7 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> 8 * (C) 2005-2012 Patrick McHardy <kaber@trash.net> 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License version 2 as 12 * published by the Free Software Foundation. 13 */ 14 15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 16 17 #include <linux/types.h> 18 #include <linux/netfilter.h> 19 #include <linux/module.h> 20 #include <linux/sched.h> 21 #include <linux/skbuff.h> 22 #include <linux/proc_fs.h> 23 #include <linux/vmalloc.h> 24 #include <linux/stddef.h> 25 #include <linux/slab.h> 26 #include <linux/random.h> 27 #include <linux/jhash.h> 28 #include <linux/err.h> 29 #include <linux/percpu.h> 30 #include <linux/moduleparam.h> 31 #include <linux/notifier.h> 32 #include <linux/kernel.h> 33 #include <linux/netdevice.h> 34 #include <linux/socket.h> 35 #include <linux/mm.h> 36 #include <linux/nsproxy.h> 37 #include <linux/rculist_nulls.h> 38 39 #include <net/netfilter/nf_conntrack.h> 40 #include <net/netfilter/nf_conntrack_l4proto.h> 41 #include <net/netfilter/nf_conntrack_expect.h> 42 #include <net/netfilter/nf_conntrack_helper.h> 43 #include <net/netfilter/nf_conntrack_seqadj.h> 44 #include <net/netfilter/nf_conntrack_core.h> 45 #include <net/netfilter/nf_conntrack_extend.h> 46 #include <net/netfilter/nf_conntrack_acct.h> 47 #include <net/netfilter/nf_conntrack_ecache.h> 48 #include <net/netfilter/nf_conntrack_zones.h> 49 #include <net/netfilter/nf_conntrack_timestamp.h> 50 #include <net/netfilter/nf_conntrack_timeout.h> 51 #include <net/netfilter/nf_conntrack_labels.h> 52 #include <net/netfilter/nf_conntrack_synproxy.h> 53 #include <net/netfilter/nf_nat.h> 54 #include <net/netfilter/nf_nat_core.h> 55 #include <net/netfilter/nf_nat_helper.h> 56 #include <net/netns/hash.h> 57 #include <net/ip.h> 58 59 #include "nf_internals.h" 60 61 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; 62 EXPORT_SYMBOL_GPL(nf_conntrack_locks); 63 64 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); 65 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); 66 67 struct hlist_nulls_head *nf_conntrack_hash __read_mostly; 68 EXPORT_SYMBOL_GPL(nf_conntrack_hash); 69 70 struct conntrack_gc_work { 71 struct delayed_work dwork; 72 u32 last_bucket; 73 bool exiting; 74 bool early_drop; 75 long next_gc_run; 76 }; 77 78 static __read_mostly struct kmem_cache *nf_conntrack_cachep; 79 static __read_mostly spinlock_t nf_conntrack_locks_all_lock; 80 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); 81 static __read_mostly bool nf_conntrack_locks_all; 82 83 /* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ 84 #define GC_MAX_BUCKETS_DIV 128u 85 /* upper bound of full table scan */ 86 #define GC_MAX_SCAN_JIFFIES (16u * HZ) 87 /* desired ratio of entries found to be expired */ 88 #define GC_EVICT_RATIO 50u 89 90 static struct conntrack_gc_work conntrack_gc_work; 91 92 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) 93 { 94 /* 1) Acquire the lock */ 95 spin_lock(lock); 96 97 /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics 98 * It pairs with the smp_store_release() in nf_conntrack_all_unlock() 99 */ 100 if 
(likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) 101 return; 102 103 /* fast path failed, unlock */ 104 spin_unlock(lock); 105 106 /* Slow path 1) get global lock */ 107 spin_lock(&nf_conntrack_locks_all_lock); 108 109 /* Slow path 2) get the lock we want */ 110 spin_lock(lock); 111 112 /* Slow path 3) release the global lock */ 113 spin_unlock(&nf_conntrack_locks_all_lock); 114 } 115 EXPORT_SYMBOL_GPL(nf_conntrack_lock); 116 117 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2) 118 { 119 h1 %= CONNTRACK_LOCKS; 120 h2 %= CONNTRACK_LOCKS; 121 spin_unlock(&nf_conntrack_locks[h1]); 122 if (h1 != h2) 123 spin_unlock(&nf_conntrack_locks[h2]); 124 } 125 126 /* return true if we need to recompute hashes (in case hash table was resized) */ 127 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, 128 unsigned int h2, unsigned int sequence) 129 { 130 h1 %= CONNTRACK_LOCKS; 131 h2 %= CONNTRACK_LOCKS; 132 if (h1 <= h2) { 133 nf_conntrack_lock(&nf_conntrack_locks[h1]); 134 if (h1 != h2) 135 spin_lock_nested(&nf_conntrack_locks[h2], 136 SINGLE_DEPTH_NESTING); 137 } else { 138 nf_conntrack_lock(&nf_conntrack_locks[h2]); 139 spin_lock_nested(&nf_conntrack_locks[h1], 140 SINGLE_DEPTH_NESTING); 141 } 142 if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { 143 nf_conntrack_double_unlock(h1, h2); 144 return true; 145 } 146 return false; 147 } 148 149 static void nf_conntrack_all_lock(void) 150 { 151 int i; 152 153 spin_lock(&nf_conntrack_locks_all_lock); 154 155 nf_conntrack_locks_all = true; 156 157 for (i = 0; i < CONNTRACK_LOCKS; i++) { 158 spin_lock(&nf_conntrack_locks[i]); 159 160 /* This spin_unlock provides the "release" to ensure that 161 * nf_conntrack_locks_all==true is visible to everyone that 162 * acquired spin_lock(&nf_conntrack_locks[]). 163 */ 164 spin_unlock(&nf_conntrack_locks[i]); 165 } 166 } 167 168 static void nf_conntrack_all_unlock(void) 169 { 170 /* All prior stores must be complete before we clear 171 * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() 172 * might observe the false value but not the entire 173 * critical section. 174 * It pairs with the smp_load_acquire() in nf_conntrack_lock() 175 */ 176 smp_store_release(&nf_conntrack_locks_all, false); 177 spin_unlock(&nf_conntrack_locks_all_lock); 178 } 179 180 unsigned int nf_conntrack_htable_size __read_mostly; 181 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); 182 183 unsigned int nf_conntrack_max __read_mostly; 184 EXPORT_SYMBOL_GPL(nf_conntrack_max); 185 seqcount_t nf_conntrack_generation __read_mostly; 186 static unsigned int nf_conntrack_hash_rnd __read_mostly; 187 188 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, 189 const struct net *net) 190 { 191 unsigned int n; 192 u32 seed; 193 194 get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); 195 196 /* The direction must be ignored, so we hash everything up to the 197 * destination ports (which is a multiple of 4) and treat the last 198 * three bytes manually. 
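 * The remaining three bytes (the 2-byte dst.u.all port/id field and the
 * 1-byte dst.protonum) are folded into the jhash2() seed below instead,
 * so they still influence the result without being part of the hashed
 * word array.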
199 */ 200 seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); 201 n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); 202 return jhash2((u32 *)tuple, n, seed ^ 203 (((__force __u16)tuple->dst.u.all << 16) | 204 tuple->dst.protonum)); 205 } 206 207 static u32 scale_hash(u32 hash) 208 { 209 return reciprocal_scale(hash, nf_conntrack_htable_size); 210 } 211 212 static u32 __hash_conntrack(const struct net *net, 213 const struct nf_conntrack_tuple *tuple, 214 unsigned int size) 215 { 216 return reciprocal_scale(hash_conntrack_raw(tuple, net), size); 217 } 218 219 static u32 hash_conntrack(const struct net *net, 220 const struct nf_conntrack_tuple *tuple) 221 { 222 return scale_hash(hash_conntrack_raw(tuple, net)); 223 } 224 225 static bool 226 nf_ct_get_tuple(const struct sk_buff *skb, 227 unsigned int nhoff, 228 unsigned int dataoff, 229 u_int16_t l3num, 230 u_int8_t protonum, 231 struct net *net, 232 struct nf_conntrack_tuple *tuple, 233 const struct nf_conntrack_l4proto *l4proto) 234 { 235 unsigned int size; 236 const __be32 *ap; 237 __be32 _addrs[8]; 238 struct { 239 __be16 sport; 240 __be16 dport; 241 } _inet_hdr, *inet_hdr; 242 243 memset(tuple, 0, sizeof(*tuple)); 244 245 tuple->src.l3num = l3num; 246 switch (l3num) { 247 case NFPROTO_IPV4: 248 nhoff += offsetof(struct iphdr, saddr); 249 size = 2 * sizeof(__be32); 250 break; 251 case NFPROTO_IPV6: 252 nhoff += offsetof(struct ipv6hdr, saddr); 253 size = sizeof(_addrs); 254 break; 255 default: 256 return true; 257 } 258 259 ap = skb_header_pointer(skb, nhoff, size, _addrs); 260 if (!ap) 261 return false; 262 263 switch (l3num) { 264 case NFPROTO_IPV4: 265 tuple->src.u3.ip = ap[0]; 266 tuple->dst.u3.ip = ap[1]; 267 break; 268 case NFPROTO_IPV6: 269 memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6)); 270 memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6)); 271 break; 272 } 273 274 tuple->dst.protonum = protonum; 275 tuple->dst.dir = IP_CT_DIR_ORIGINAL; 276 277 if (unlikely(l4proto->pkt_to_tuple)) 278 return l4proto->pkt_to_tuple(skb, dataoff, net, tuple); 279 280 /* Actually only need first 4 bytes to get ports. */ 281 inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr); 282 if (!inet_hdr) 283 return false; 284 285 tuple->src.u.udp.port = inet_hdr->sport; 286 tuple->dst.u.udp.port = inet_hdr->dport; 287 return true; 288 } 289 290 static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 291 u_int8_t *protonum) 292 { 293 int dataoff = -1; 294 const struct iphdr *iph; 295 struct iphdr _iph; 296 297 iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph); 298 if (!iph) 299 return -1; 300 301 /* Conntrack defragments packets, we might still see fragments 302 * inside ICMP packets though. 
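 * A packet with a non-zero fragment offset carries no L4 header, so it
 * cannot be mapped to a tuple; such fragments are rejected below.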
303 */ 304 if (iph->frag_off & htons(IP_OFFSET)) 305 return -1; 306 307 dataoff = nhoff + (iph->ihl << 2); 308 *protonum = iph->protocol; 309 310 /* Check bogus IP headers */ 311 if (dataoff > skb->len) { 312 pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n", 313 nhoff, iph->ihl << 2, skb->len); 314 return -1; 315 } 316 return dataoff; 317 } 318 319 #if IS_ENABLED(CONFIG_IPV6) 320 static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff, 321 u8 *protonum) 322 { 323 int protoff = -1; 324 unsigned int extoff = nhoff + sizeof(struct ipv6hdr); 325 __be16 frag_off; 326 u8 nexthdr; 327 328 if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr), 329 &nexthdr, sizeof(nexthdr)) != 0) { 330 pr_debug("can't get nexthdr\n"); 331 return -1; 332 } 333 protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off); 334 /* 335 * (protoff == skb->len) means the packet has not data, just 336 * IPv6 and possibly extensions headers, but it is tracked anyway 337 */ 338 if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { 339 pr_debug("can't find proto in pkt\n"); 340 return -1; 341 } 342 343 *protonum = nexthdr; 344 return protoff; 345 } 346 #endif 347 348 static int get_l4proto(const struct sk_buff *skb, 349 unsigned int nhoff, u8 pf, u8 *l4num) 350 { 351 switch (pf) { 352 case NFPROTO_IPV4: 353 return ipv4_get_l4proto(skb, nhoff, l4num); 354 #if IS_ENABLED(CONFIG_IPV6) 355 case NFPROTO_IPV6: 356 return ipv6_get_l4proto(skb, nhoff, l4num); 357 #endif 358 default: 359 *l4num = 0; 360 break; 361 } 362 return -1; 363 } 364 365 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, 366 u_int16_t l3num, 367 struct net *net, struct nf_conntrack_tuple *tuple) 368 { 369 const struct nf_conntrack_l4proto *l4proto; 370 u8 protonum; 371 int protoff; 372 int ret; 373 374 rcu_read_lock(); 375 376 protoff = get_l4proto(skb, nhoff, l3num, &protonum); 377 if (protoff <= 0) { 378 rcu_read_unlock(); 379 return false; 380 } 381 382 l4proto = __nf_ct_l4proto_find(protonum); 383 384 ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple, 385 l4proto); 386 387 rcu_read_unlock(); 388 return ret; 389 } 390 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr); 391 392 bool 393 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, 394 const struct nf_conntrack_tuple *orig, 395 const struct nf_conntrack_l4proto *l4proto) 396 { 397 memset(inverse, 0, sizeof(*inverse)); 398 399 inverse->src.l3num = orig->src.l3num; 400 401 switch (orig->src.l3num) { 402 case NFPROTO_IPV4: 403 inverse->src.u3.ip = orig->dst.u3.ip; 404 inverse->dst.u3.ip = orig->src.u3.ip; 405 break; 406 case NFPROTO_IPV6: 407 inverse->src.u3.in6 = orig->dst.u3.in6; 408 inverse->dst.u3.in6 = orig->src.u3.in6; 409 break; 410 default: 411 break; 412 } 413 414 inverse->dst.dir = !orig->dst.dir; 415 416 inverse->dst.protonum = orig->dst.protonum; 417 418 if (unlikely(l4proto->invert_tuple)) 419 return l4proto->invert_tuple(inverse, orig); 420 421 inverse->src.u.all = orig->dst.u.all; 422 inverse->dst.u.all = orig->src.u.all; 423 return true; 424 } 425 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); 426 427 static void 428 clean_from_lists(struct nf_conn *ct) 429 { 430 pr_debug("clean_from_lists(%p)\n", ct); 431 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 432 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode); 433 434 /* Destroy all pending expectations */ 435 nf_ct_remove_expectations(ct); 436 } 437 438 /* must be called with local_bh_disable */ 439 static void nf_ct_add_to_dying_list(struct nf_conn *ct) 440 
{ 441 struct ct_pcpu *pcpu; 442 443 /* add this conntrack to the (per cpu) dying list */ 444 ct->cpu = smp_processor_id(); 445 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 446 447 spin_lock(&pcpu->lock); 448 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 449 &pcpu->dying); 450 spin_unlock(&pcpu->lock); 451 } 452 453 /* must be called with local_bh_disable */ 454 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct) 455 { 456 struct ct_pcpu *pcpu; 457 458 /* add this conntrack to the (per cpu) unconfirmed list */ 459 ct->cpu = smp_processor_id(); 460 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 461 462 spin_lock(&pcpu->lock); 463 hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 464 &pcpu->unconfirmed); 465 spin_unlock(&pcpu->lock); 466 } 467 468 /* must be called with local_bh_disable */ 469 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) 470 { 471 struct ct_pcpu *pcpu; 472 473 /* We overload first tuple to link into unconfirmed or dying list.*/ 474 pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu); 475 476 spin_lock(&pcpu->lock); 477 BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode)); 478 hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode); 479 spin_unlock(&pcpu->lock); 480 } 481 482 #define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK) 483 484 /* Released via destroy_conntrack() */ 485 struct nf_conn *nf_ct_tmpl_alloc(struct net *net, 486 const struct nf_conntrack_zone *zone, 487 gfp_t flags) 488 { 489 struct nf_conn *tmpl, *p; 490 491 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) { 492 tmpl = kzalloc(sizeof(*tmpl) + NFCT_INFOMASK, flags); 493 if (!tmpl) 494 return NULL; 495 496 p = tmpl; 497 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 498 if (tmpl != p) { 499 tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); 500 tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; 501 } 502 } else { 503 tmpl = kzalloc(sizeof(*tmpl), flags); 504 if (!tmpl) 505 return NULL; 506 } 507 508 tmpl->status = IPS_TEMPLATE; 509 write_pnet(&tmpl->ct_net, net); 510 nf_ct_zone_add(tmpl, zone); 511 atomic_set(&tmpl->ct_general.use, 0); 512 513 return tmpl; 514 } 515 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); 516 517 void nf_ct_tmpl_free(struct nf_conn *tmpl) 518 { 519 nf_ct_ext_destroy(tmpl); 520 nf_ct_ext_free(tmpl); 521 522 if (ARCH_KMALLOC_MINALIGN <= NFCT_INFOMASK) 523 kfree((char *)tmpl - tmpl->proto.tmpl_padto); 524 else 525 kfree(tmpl); 526 } 527 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free); 528 529 static void 530 destroy_conntrack(struct nf_conntrack *nfct) 531 { 532 struct nf_conn *ct = (struct nf_conn *)nfct; 533 const struct nf_conntrack_l4proto *l4proto; 534 535 pr_debug("destroy_conntrack(%p)\n", ct); 536 WARN_ON(atomic_read(&nfct->use) != 0); 537 538 if (unlikely(nf_ct_is_template(ct))) { 539 nf_ct_tmpl_free(ct); 540 return; 541 } 542 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); 543 if (l4proto->destroy) 544 l4proto->destroy(ct); 545 546 local_bh_disable(); 547 /* Expectations will have been removed in clean_from_lists, 548 * except TFTP can create an expectation on the first packet, 549 * before connection is in the list, so we need to clean here, 550 * too. 
551 */ 552 nf_ct_remove_expectations(ct); 553 554 nf_ct_del_from_dying_or_unconfirmed_list(ct); 555 556 local_bh_enable(); 557 558 if (ct->master) 559 nf_ct_put(ct->master); 560 561 pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct); 562 nf_conntrack_free(ct); 563 } 564 565 static void nf_ct_delete_from_lists(struct nf_conn *ct) 566 { 567 struct net *net = nf_ct_net(ct); 568 unsigned int hash, reply_hash; 569 unsigned int sequence; 570 571 nf_ct_helper_destroy(ct); 572 573 local_bh_disable(); 574 do { 575 sequence = read_seqcount_begin(&nf_conntrack_generation); 576 hash = hash_conntrack(net, 577 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 578 reply_hash = hash_conntrack(net, 579 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 580 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 581 582 clean_from_lists(ct); 583 nf_conntrack_double_unlock(hash, reply_hash); 584 585 nf_ct_add_to_dying_list(ct); 586 587 local_bh_enable(); 588 } 589 590 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) 591 { 592 struct nf_conn_tstamp *tstamp; 593 594 if (test_and_set_bit(IPS_DYING_BIT, &ct->status)) 595 return false; 596 597 tstamp = nf_conn_tstamp_find(ct); 598 if (tstamp && tstamp->stop == 0) 599 tstamp->stop = ktime_get_real_ns(); 600 601 if (nf_conntrack_event_report(IPCT_DESTROY, ct, 602 portid, report) < 0) { 603 /* destroy event was not delivered. nf_ct_put will 604 * be done by event cache worker on redelivery. 605 */ 606 nf_ct_delete_from_lists(ct); 607 nf_conntrack_ecache_delayed_work(nf_ct_net(ct)); 608 return false; 609 } 610 611 nf_conntrack_ecache_work(nf_ct_net(ct)); 612 nf_ct_delete_from_lists(ct); 613 nf_ct_put(ct); 614 return true; 615 } 616 EXPORT_SYMBOL_GPL(nf_ct_delete); 617 618 static inline bool 619 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, 620 const struct nf_conntrack_tuple *tuple, 621 const struct nf_conntrack_zone *zone, 622 const struct net *net) 623 { 624 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 625 626 /* A conntrack can be recreated with the equal tuple, 627 * so we need to check that the conntrack is confirmed 628 */ 629 return nf_ct_tuple_equal(tuple, &h->tuple) && 630 nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && 631 nf_ct_is_confirmed(ct) && 632 net_eq(net, nf_ct_net(ct)); 633 } 634 635 static inline bool 636 nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2) 637 { 638 return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 639 &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) && 640 nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 641 &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) && 642 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) && 643 nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) && 644 net_eq(nf_ct_net(ct1), nf_ct_net(ct2)); 645 } 646 647 /* caller must hold rcu readlock and none of the nf_conntrack_locks */ 648 static void nf_ct_gc_expired(struct nf_conn *ct) 649 { 650 if (!atomic_inc_not_zero(&ct->ct_general.use)) 651 return; 652 653 if (nf_ct_should_gc(ct)) 654 nf_ct_kill(ct); 655 656 nf_ct_put(ct); 657 } 658 659 /* 660 * Warning : 661 * - Caller must take a reference on returned object 662 * and recheck nf_ct_tuple_equal(tuple, &h->tuple) 663 */ 664 static struct nf_conntrack_tuple_hash * 665 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, 666 const struct nf_conntrack_tuple *tuple, u32 hash) 667 { 668 struct nf_conntrack_tuple_hash *h; 669 struct hlist_nulls_head *ct_hash; 670 struct hlist_nulls_node *n; 671 unsigned int bucket, hsize; 
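	/* Lockless walk of an hlist_nulls chain: if the nulls value found at
	 * the end of the chain does not match the bucket we started in, the
	 * entry we followed was moved to another chain (e.g. by a resize)
	 * and the walk restarts from "begin".
	 *
	 * Rough sketch of the caller contract (see __nf_conntrack_find_get()
	 * below for the real thing): take a reference and re-validate the
	 * key, since the object may have been freed or recycled meanwhile:
	 *
	 *	h = ____nf_conntrack_find(net, zone, tuple, hash);
	 *	if (h) {
	 *		ct = nf_ct_tuplehash_to_ctrack(h);
	 *		if (!atomic_inc_not_zero(&ct->ct_general.use))
	 *			h = NULL;
	 *		else if (!nf_ct_key_equal(h, tuple, zone, net)) {
	 *			nf_ct_put(ct);
	 *			goto begin;
	 *		}
	 *	}
	 */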
672 673 begin: 674 nf_conntrack_get_ht(&ct_hash, &hsize); 675 bucket = reciprocal_scale(hash, hsize); 676 677 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { 678 struct nf_conn *ct; 679 680 ct = nf_ct_tuplehash_to_ctrack(h); 681 if (nf_ct_is_expired(ct)) { 682 nf_ct_gc_expired(ct); 683 continue; 684 } 685 686 if (nf_ct_is_dying(ct)) 687 continue; 688 689 if (nf_ct_key_equal(h, tuple, zone, net)) 690 return h; 691 } 692 /* 693 * if the nulls value we got at the end of this lookup is 694 * not the expected one, we must restart lookup. 695 * We probably met an item that was moved to another chain. 696 */ 697 if (get_nulls_value(n) != bucket) { 698 NF_CT_STAT_INC_ATOMIC(net, search_restart); 699 goto begin; 700 } 701 702 return NULL; 703 } 704 705 /* Find a connection corresponding to a tuple. */ 706 static struct nf_conntrack_tuple_hash * 707 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 708 const struct nf_conntrack_tuple *tuple, u32 hash) 709 { 710 struct nf_conntrack_tuple_hash *h; 711 struct nf_conn *ct; 712 713 rcu_read_lock(); 714 begin: 715 h = ____nf_conntrack_find(net, zone, tuple, hash); 716 if (h) { 717 ct = nf_ct_tuplehash_to_ctrack(h); 718 if (unlikely(nf_ct_is_dying(ct) || 719 !atomic_inc_not_zero(&ct->ct_general.use))) 720 h = NULL; 721 else { 722 if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { 723 nf_ct_put(ct); 724 goto begin; 725 } 726 } 727 } 728 rcu_read_unlock(); 729 730 return h; 731 } 732 733 struct nf_conntrack_tuple_hash * 734 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, 735 const struct nf_conntrack_tuple *tuple) 736 { 737 return __nf_conntrack_find_get(net, zone, tuple, 738 hash_conntrack_raw(tuple, net)); 739 } 740 EXPORT_SYMBOL_GPL(nf_conntrack_find_get); 741 742 static void __nf_conntrack_hash_insert(struct nf_conn *ct, 743 unsigned int hash, 744 unsigned int reply_hash) 745 { 746 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, 747 &nf_conntrack_hash[hash]); 748 hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, 749 &nf_conntrack_hash[reply_hash]); 750 } 751 752 int 753 nf_conntrack_hash_check_insert(struct nf_conn *ct) 754 { 755 const struct nf_conntrack_zone *zone; 756 struct net *net = nf_ct_net(ct); 757 unsigned int hash, reply_hash; 758 struct nf_conntrack_tuple_hash *h; 759 struct hlist_nulls_node *n; 760 unsigned int sequence; 761 762 zone = nf_ct_zone(ct); 763 764 local_bh_disable(); 765 do { 766 sequence = read_seqcount_begin(&nf_conntrack_generation); 767 hash = hash_conntrack(net, 768 &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 769 reply_hash = hash_conntrack(net, 770 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 771 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 772 773 /* See if there's one in the list already, including reverse */ 774 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 775 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 776 zone, net)) 777 goto out; 778 779 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 780 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 781 zone, net)) 782 goto out; 783 784 smp_wmb(); 785 /* The caller holds a reference to this object */ 786 atomic_set(&ct->ct_general.use, 2); 787 __nf_conntrack_hash_insert(ct, hash, reply_hash); 788 nf_conntrack_double_unlock(hash, reply_hash); 789 NF_CT_STAT_INC(net, insert); 790 local_bh_enable(); 791 return 0; 792 793 out: 794 nf_conntrack_double_unlock(hash, reply_hash); 
795 NF_CT_STAT_INC(net, insert_failed); 796 local_bh_enable(); 797 return -EEXIST; 798 } 799 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); 800 801 static inline void nf_ct_acct_update(struct nf_conn *ct, 802 enum ip_conntrack_info ctinfo, 803 unsigned int len) 804 { 805 struct nf_conn_acct *acct; 806 807 acct = nf_conn_acct_find(ct); 808 if (acct) { 809 struct nf_conn_counter *counter = acct->counter; 810 811 atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); 812 atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); 813 } 814 } 815 816 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, 817 const struct nf_conn *loser_ct) 818 { 819 struct nf_conn_acct *acct; 820 821 acct = nf_conn_acct_find(loser_ct); 822 if (acct) { 823 struct nf_conn_counter *counter = acct->counter; 824 unsigned int bytes; 825 826 /* u32 should be fine since we must have seen one packet. */ 827 bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); 828 nf_ct_acct_update(ct, ctinfo, bytes); 829 } 830 } 831 832 /* Resolve race on insertion if this protocol allows this. */ 833 static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, 834 enum ip_conntrack_info ctinfo, 835 struct nf_conntrack_tuple_hash *h) 836 { 837 /* This is the conntrack entry already in hashes that won race. */ 838 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); 839 const struct nf_conntrack_l4proto *l4proto; 840 enum ip_conntrack_info oldinfo; 841 struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo); 842 843 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); 844 if (l4proto->allow_clash && 845 !nf_ct_is_dying(ct) && 846 atomic_inc_not_zero(&ct->ct_general.use)) { 847 if (((ct->status & IPS_NAT_DONE_MASK) == 0) || 848 nf_ct_match(ct, loser_ct)) { 849 nf_ct_acct_merge(ct, ctinfo, loser_ct); 850 nf_conntrack_put(&loser_ct->ct_general); 851 nf_ct_set(skb, ct, oldinfo); 852 return NF_ACCEPT; 853 } 854 nf_ct_put(ct); 855 } 856 NF_CT_STAT_INC(net, drop); 857 return NF_DROP; 858 } 859 860 /* Confirm a connection given skb; places it in hash table */ 861 int 862 __nf_conntrack_confirm(struct sk_buff *skb) 863 { 864 const struct nf_conntrack_zone *zone; 865 unsigned int hash, reply_hash; 866 struct nf_conntrack_tuple_hash *h; 867 struct nf_conn *ct; 868 struct nf_conn_help *help; 869 struct nf_conn_tstamp *tstamp; 870 struct hlist_nulls_node *n; 871 enum ip_conntrack_info ctinfo; 872 struct net *net; 873 unsigned int sequence; 874 int ret = NF_DROP; 875 876 ct = nf_ct_get(skb, &ctinfo); 877 net = nf_ct_net(ct); 878 879 /* ipt_REJECT uses nf_conntrack_attach to attach related 880 ICMP/TCP RST packets in other direction. Actual packet 881 which created connection will be IP_CT_NEW or for an 882 expected connection, IP_CT_RELATED. */ 883 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) 884 return NF_ACCEPT; 885 886 zone = nf_ct_zone(ct); 887 local_bh_disable(); 888 889 do { 890 sequence = read_seqcount_begin(&nf_conntrack_generation); 891 /* reuse the hash saved before */ 892 hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; 893 hash = scale_hash(hash); 894 reply_hash = hash_conntrack(net, 895 &ct->tuplehash[IP_CT_DIR_REPLY].tuple); 896 897 } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); 898 899 /* We're not in hash table, and we refuse to set up related 900 * connections for unconfirmed conns. But packet copies and 901 * REJECT will give spurious warnings here. 902 */ 903 904 /* No external references means no one else could have 905 * confirmed us. 
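 * (At this point the only reference is the one taken in init_conntrack()
 * and carried by the skb; the hash table gets its own reference via the
 * atomic_inc() further down, just before insertion.)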
906 */ 907 WARN_ON(nf_ct_is_confirmed(ct)); 908 pr_debug("Confirming conntrack %p\n", ct); 909 /* We have to check the DYING flag after unlink to prevent 910 * a race against nf_ct_get_next_corpse() possibly called from 911 * user context, else we insert an already 'dead' hash, blocking 912 * further use of that particular connection -JM. 913 */ 914 nf_ct_del_from_dying_or_unconfirmed_list(ct); 915 916 if (unlikely(nf_ct_is_dying(ct))) { 917 nf_ct_add_to_dying_list(ct); 918 goto dying; 919 } 920 921 /* See if there's one in the list already, including reverse: 922 NAT could have grabbed it without realizing, since we're 923 not in the hash. If there is, we lost race. */ 924 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) 925 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 926 zone, net)) 927 goto out; 928 929 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) 930 if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, 931 zone, net)) 932 goto out; 933 934 /* Timer relative to confirmation time, not original 935 setting time, otherwise we'd get timer wrap in 936 weird delay cases. */ 937 ct->timeout += nfct_time_stamp; 938 atomic_inc(&ct->ct_general.use); 939 ct->status |= IPS_CONFIRMED; 940 941 /* set conntrack timestamp, if enabled. */ 942 tstamp = nf_conn_tstamp_find(ct); 943 if (tstamp) { 944 if (skb->tstamp == 0) 945 __net_timestamp(skb); 946 947 tstamp->start = ktime_to_ns(skb->tstamp); 948 } 949 /* Since the lookup is lockless, hash insertion must be done after 950 * starting the timer and setting the CONFIRMED bit. The RCU barriers 951 * guarantee that no other CPU can find the conntrack before the above 952 * stores are visible. 953 */ 954 __nf_conntrack_hash_insert(ct, hash, reply_hash); 955 nf_conntrack_double_unlock(hash, reply_hash); 956 local_bh_enable(); 957 958 help = nfct_help(ct); 959 if (help && help->helper) 960 nf_conntrack_event_cache(IPCT_HELPER, ct); 961 962 nf_conntrack_event_cache(master_ct(ct) ? 963 IPCT_RELATED : IPCT_NEW, ct); 964 return NF_ACCEPT; 965 966 out: 967 nf_ct_add_to_dying_list(ct); 968 ret = nf_ct_resolve_clash(net, skb, ctinfo, h); 969 dying: 970 nf_conntrack_double_unlock(hash, reply_hash); 971 NF_CT_STAT_INC(net, insert_failed); 972 local_bh_enable(); 973 return ret; 974 } 975 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); 976 977 /* Returns true if a connection correspondings to the tuple (required 978 for NAT). 
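   A typical use is NAT port allocation, roughly (cf. nf_nat_used_tuple()
   in nf_nat_core.c): since the table only stores incoming tuples, the
   candidate tuple is inverted first and the reply direction is probed:

	struct nf_conntrack_tuple reply;

	nf_ct_invert_tuplepr(&reply, tuple);
	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);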
*/ 979 int 980 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, 981 const struct nf_conn *ignored_conntrack) 982 { 983 struct net *net = nf_ct_net(ignored_conntrack); 984 const struct nf_conntrack_zone *zone; 985 struct nf_conntrack_tuple_hash *h; 986 struct hlist_nulls_head *ct_hash; 987 unsigned int hash, hsize; 988 struct hlist_nulls_node *n; 989 struct nf_conn *ct; 990 991 zone = nf_ct_zone(ignored_conntrack); 992 993 rcu_read_lock(); 994 begin: 995 nf_conntrack_get_ht(&ct_hash, &hsize); 996 hash = __hash_conntrack(net, tuple, hsize); 997 998 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { 999 ct = nf_ct_tuplehash_to_ctrack(h); 1000 1001 if (ct == ignored_conntrack) 1002 continue; 1003 1004 if (nf_ct_is_expired(ct)) { 1005 nf_ct_gc_expired(ct); 1006 continue; 1007 } 1008 1009 if (nf_ct_key_equal(h, tuple, zone, net)) { 1010 NF_CT_STAT_INC_ATOMIC(net, found); 1011 rcu_read_unlock(); 1012 return 1; 1013 } 1014 } 1015 1016 if (get_nulls_value(n) != hash) { 1017 NF_CT_STAT_INC_ATOMIC(net, search_restart); 1018 goto begin; 1019 } 1020 1021 rcu_read_unlock(); 1022 1023 return 0; 1024 } 1025 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); 1026 1027 #define NF_CT_EVICTION_RANGE 8 1028 1029 /* There's a small race here where we may free a just-assured 1030 connection. Too bad: we're in trouble anyway. */ 1031 static unsigned int early_drop_list(struct net *net, 1032 struct hlist_nulls_head *head) 1033 { 1034 struct nf_conntrack_tuple_hash *h; 1035 struct hlist_nulls_node *n; 1036 unsigned int drops = 0; 1037 struct nf_conn *tmp; 1038 1039 hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) { 1040 tmp = nf_ct_tuplehash_to_ctrack(h); 1041 1042 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) 1043 continue; 1044 1045 if (nf_ct_is_expired(tmp)) { 1046 nf_ct_gc_expired(tmp); 1047 continue; 1048 } 1049 1050 if (test_bit(IPS_ASSURED_BIT, &tmp->status) || 1051 !net_eq(nf_ct_net(tmp), net) || 1052 nf_ct_is_dying(tmp)) 1053 continue; 1054 1055 if (!atomic_inc_not_zero(&tmp->ct_general.use)) 1056 continue; 1057 1058 /* kill only if still in same netns -- might have moved due to 1059 * SLAB_TYPESAFE_BY_RCU rules. 1060 * 1061 * We steal the timer reference. If that fails timer has 1062 * already fired or someone else deleted it. Just drop ref 1063 * and move to next entry. 
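 * (SLAB_TYPESAFE_BY_RCU means the object can be freed and immediately
 * reused as a different conntrack while we walk the chain, which is why
 * both the netns and the confirmed state are re-checked after the
 * reference has been taken.)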
1064 */ 1065 if (net_eq(nf_ct_net(tmp), net) && 1066 nf_ct_is_confirmed(tmp) && 1067 nf_ct_delete(tmp, 0, 0)) 1068 drops++; 1069 1070 nf_ct_put(tmp); 1071 } 1072 1073 return drops; 1074 } 1075 1076 static noinline int early_drop(struct net *net, unsigned int hash) 1077 { 1078 unsigned int i, bucket; 1079 1080 for (i = 0; i < NF_CT_EVICTION_RANGE; i++) { 1081 struct hlist_nulls_head *ct_hash; 1082 unsigned int hsize, drops; 1083 1084 rcu_read_lock(); 1085 nf_conntrack_get_ht(&ct_hash, &hsize); 1086 if (!i) 1087 bucket = reciprocal_scale(hash, hsize); 1088 else 1089 bucket = (bucket + 1) % hsize; 1090 1091 drops = early_drop_list(net, &ct_hash[bucket]); 1092 rcu_read_unlock(); 1093 1094 if (drops) { 1095 NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops); 1096 return true; 1097 } 1098 } 1099 1100 return false; 1101 } 1102 1103 static bool gc_worker_skip_ct(const struct nf_conn *ct) 1104 { 1105 return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); 1106 } 1107 1108 static bool gc_worker_can_early_drop(const struct nf_conn *ct) 1109 { 1110 const struct nf_conntrack_l4proto *l4proto; 1111 1112 if (!test_bit(IPS_ASSURED_BIT, &ct->status)) 1113 return true; 1114 1115 l4proto = __nf_ct_l4proto_find(nf_ct_protonum(ct)); 1116 if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) 1117 return true; 1118 1119 return false; 1120 } 1121 1122 #define DAY (86400 * HZ) 1123 1124 /* Set an arbitrary timeout large enough not to ever expire, this save 1125 * us a check for the IPS_OFFLOAD_BIT from the packet path via 1126 * nf_ct_is_expired(). 1127 */ 1128 static void nf_ct_offload_timeout(struct nf_conn *ct) 1129 { 1130 if (nf_ct_expires(ct) < DAY / 2) 1131 ct->timeout = nfct_time_stamp + DAY; 1132 } 1133 1134 static void gc_worker(struct work_struct *work) 1135 { 1136 unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); 1137 unsigned int i, goal, buckets = 0, expired_count = 0; 1138 unsigned int nf_conntrack_max95 = 0; 1139 struct conntrack_gc_work *gc_work; 1140 unsigned int ratio, scanned = 0; 1141 unsigned long next_run; 1142 1143 gc_work = container_of(work, struct conntrack_gc_work, dwork.work); 1144 1145 goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; 1146 i = gc_work->last_bucket; 1147 if (gc_work->early_drop) 1148 nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; 1149 1150 do { 1151 struct nf_conntrack_tuple_hash *h; 1152 struct hlist_nulls_head *ct_hash; 1153 struct hlist_nulls_node *n; 1154 unsigned int hashsz; 1155 struct nf_conn *tmp; 1156 1157 i++; 1158 rcu_read_lock(); 1159 1160 nf_conntrack_get_ht(&ct_hash, &hashsz); 1161 if (i >= hashsz) 1162 i = 0; 1163 1164 hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { 1165 struct net *net; 1166 1167 tmp = nf_ct_tuplehash_to_ctrack(h); 1168 1169 scanned++; 1170 if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { 1171 nf_ct_offload_timeout(tmp); 1172 continue; 1173 } 1174 1175 if (nf_ct_is_expired(tmp)) { 1176 nf_ct_gc_expired(tmp); 1177 expired_count++; 1178 continue; 1179 } 1180 1181 if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) 1182 continue; 1183 1184 net = nf_ct_net(tmp); 1185 if (atomic_read(&net->ct.count) < nf_conntrack_max95) 1186 continue; 1187 1188 /* need to take reference to avoid possible races */ 1189 if (!atomic_inc_not_zero(&tmp->ct_general.use)) 1190 continue; 1191 1192 if (gc_worker_skip_ct(tmp)) { 1193 nf_ct_put(tmp); 1194 continue; 1195 } 1196 1197 if (gc_worker_can_early_drop(tmp)) 1198 nf_ct_kill(tmp); 1199 1200 nf_ct_put(tmp); 1201 } 1202 1203 /* could check get_nulls_value() here and restart if ct 
1204 * was moved to another chain. But given gc is best-effort 1205 * we will just continue with next hash slot. 1206 */ 1207 rcu_read_unlock(); 1208 cond_resched(); 1209 } while (++buckets < goal); 1210 1211 if (gc_work->exiting) 1212 return; 1213 1214 /* 1215 * Eviction will normally happen from the packet path, and not 1216 * from this gc worker. 1217 * 1218 * This worker is only here to reap expired entries when system went 1219 * idle after a busy period. 1220 * 1221 * The heuristics below are supposed to balance conflicting goals: 1222 * 1223 * 1. Minimize time until we notice a stale entry 1224 * 2. Maximize scan intervals to not waste cycles 1225 * 1226 * Normally, expire ratio will be close to 0. 1227 * 1228 * As soon as a sizeable fraction of the entries have expired 1229 * increase scan frequency. 1230 */ 1231 ratio = scanned ? expired_count * 100 / scanned : 0; 1232 if (ratio > GC_EVICT_RATIO) { 1233 gc_work->next_gc_run = min_interval; 1234 } else { 1235 unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; 1236 1237 BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); 1238 1239 gc_work->next_gc_run += min_interval; 1240 if (gc_work->next_gc_run > max) 1241 gc_work->next_gc_run = max; 1242 } 1243 1244 next_run = gc_work->next_gc_run; 1245 gc_work->last_bucket = i; 1246 gc_work->early_drop = false; 1247 queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); 1248 } 1249 1250 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) 1251 { 1252 INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); 1253 gc_work->next_gc_run = HZ; 1254 gc_work->exiting = false; 1255 } 1256 1257 static struct nf_conn * 1258 __nf_conntrack_alloc(struct net *net, 1259 const struct nf_conntrack_zone *zone, 1260 const struct nf_conntrack_tuple *orig, 1261 const struct nf_conntrack_tuple *repl, 1262 gfp_t gfp, u32 hash) 1263 { 1264 struct nf_conn *ct; 1265 1266 /* We don't want any race condition at early drop stage */ 1267 atomic_inc(&net->ct.count); 1268 1269 if (nf_conntrack_max && 1270 unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { 1271 if (!early_drop(net, hash)) { 1272 if (!conntrack_gc_work.early_drop) 1273 conntrack_gc_work.early_drop = true; 1274 atomic_dec(&net->ct.count); 1275 net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); 1276 return ERR_PTR(-ENOMEM); 1277 } 1278 } 1279 1280 /* 1281 * Do not use kmem_cache_zalloc(), as this cache uses 1282 * SLAB_TYPESAFE_BY_RCU. 1283 */ 1284 ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); 1285 if (ct == NULL) 1286 goto out; 1287 1288 spin_lock_init(&ct->lock); 1289 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 1290 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 1291 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; 1292 /* save hash for reusing when confirming */ 1293 *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; 1294 ct->status = 0; 1295 write_pnet(&ct->ct_net, net); 1296 memset(&ct->__nfct_init_offset[0], 0, 1297 offsetof(struct nf_conn, proto) - 1298 offsetof(struct nf_conn, __nfct_init_offset[0])); 1299 1300 nf_ct_zone_add(ct, zone); 1301 1302 /* Because we use RCU lookups, we set ct_general.use to zero before 1303 * this is inserted in any list. 
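 * A zero refcount is what marks a free/recycled object under
 * SLAB_TYPESAFE_BY_RCU: concurrent lookups can only obtain the entry
 * through atomic_inc_not_zero() (see __nf_conntrack_find_get() and
 * nf_ct_gc_expired()), and nf_conntrack_free() relies on the same rule.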
1304 */ 1305 atomic_set(&ct->ct_general.use, 0); 1306 return ct; 1307 out: 1308 atomic_dec(&net->ct.count); 1309 return ERR_PTR(-ENOMEM); 1310 } 1311 1312 struct nf_conn *nf_conntrack_alloc(struct net *net, 1313 const struct nf_conntrack_zone *zone, 1314 const struct nf_conntrack_tuple *orig, 1315 const struct nf_conntrack_tuple *repl, 1316 gfp_t gfp) 1317 { 1318 return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); 1319 } 1320 EXPORT_SYMBOL_GPL(nf_conntrack_alloc); 1321 1322 void nf_conntrack_free(struct nf_conn *ct) 1323 { 1324 struct net *net = nf_ct_net(ct); 1325 1326 /* A freed object has refcnt == 0, that's 1327 * the golden rule for SLAB_TYPESAFE_BY_RCU 1328 */ 1329 WARN_ON(atomic_read(&ct->ct_general.use) != 0); 1330 1331 nf_ct_ext_destroy(ct); 1332 nf_ct_ext_free(ct); 1333 kmem_cache_free(nf_conntrack_cachep, ct); 1334 smp_mb__before_atomic(); 1335 atomic_dec(&net->ct.count); 1336 } 1337 EXPORT_SYMBOL_GPL(nf_conntrack_free); 1338 1339 1340 /* Allocate a new conntrack: we return -ENOMEM if classification 1341 failed due to stress. Otherwise it really is unclassifiable. */ 1342 static noinline struct nf_conntrack_tuple_hash * 1343 init_conntrack(struct net *net, struct nf_conn *tmpl, 1344 const struct nf_conntrack_tuple *tuple, 1345 const struct nf_conntrack_l4proto *l4proto, 1346 struct sk_buff *skb, 1347 unsigned int dataoff, u32 hash) 1348 { 1349 struct nf_conn *ct; 1350 struct nf_conn_help *help; 1351 struct nf_conntrack_tuple repl_tuple; 1352 struct nf_conntrack_ecache *ecache; 1353 struct nf_conntrack_expect *exp = NULL; 1354 const struct nf_conntrack_zone *zone; 1355 struct nf_conn_timeout *timeout_ext; 1356 struct nf_conntrack_zone tmp; 1357 1358 if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) { 1359 pr_debug("Can't invert tuple.\n"); 1360 return NULL; 1361 } 1362 1363 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1364 ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, 1365 hash); 1366 if (IS_ERR(ct)) 1367 return (struct nf_conntrack_tuple_hash *)ct; 1368 1369 if (!nf_ct_add_synproxy(ct, tmpl)) { 1370 nf_conntrack_free(ct); 1371 return ERR_PTR(-ENOMEM); 1372 } 1373 1374 timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL; 1375 1376 if (timeout_ext) 1377 nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), 1378 GFP_ATOMIC); 1379 1380 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1381 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); 1382 nf_ct_labels_ext_add(ct); 1383 1384 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 1385 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 1386 ecache ? ecache->expmask : 0, 1387 GFP_ATOMIC); 1388 1389 local_bh_disable(); 1390 if (net->ct.expect_count) { 1391 spin_lock(&nf_conntrack_expect_lock); 1392 exp = nf_ct_find_expectation(net, zone, tuple); 1393 if (exp) { 1394 pr_debug("expectation arrives ct=%p exp=%p\n", 1395 ct, exp); 1396 /* Welcome, Mr. Bond. We've been expecting you... 
*/ 1397 __set_bit(IPS_EXPECTED_BIT, &ct->status); 1398 /* exp->master safe, refcnt bumped in nf_ct_find_expectation */ 1399 ct->master = exp->master; 1400 if (exp->helper) { 1401 help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); 1402 if (help) 1403 rcu_assign_pointer(help->helper, exp->helper); 1404 } 1405 1406 #ifdef CONFIG_NF_CONNTRACK_MARK 1407 ct->mark = exp->master->mark; 1408 #endif 1409 #ifdef CONFIG_NF_CONNTRACK_SECMARK 1410 ct->secmark = exp->master->secmark; 1411 #endif 1412 NF_CT_STAT_INC(net, expect_new); 1413 } 1414 spin_unlock(&nf_conntrack_expect_lock); 1415 } 1416 if (!exp) 1417 __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); 1418 1419 /* Now it is inserted into the unconfirmed list, bump refcount */ 1420 nf_conntrack_get(&ct->ct_general); 1421 nf_ct_add_to_unconfirmed_list(ct); 1422 1423 local_bh_enable(); 1424 1425 if (exp) { 1426 if (exp->expectfn) 1427 exp->expectfn(ct, exp); 1428 nf_ct_expect_put(exp); 1429 } 1430 1431 return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; 1432 } 1433 1434 /* On success, returns 0, sets skb->_nfct | ctinfo */ 1435 static int 1436 resolve_normal_ct(struct nf_conn *tmpl, 1437 struct sk_buff *skb, 1438 unsigned int dataoff, 1439 u_int8_t protonum, 1440 const struct nf_conntrack_l4proto *l4proto, 1441 const struct nf_hook_state *state) 1442 { 1443 const struct nf_conntrack_zone *zone; 1444 struct nf_conntrack_tuple tuple; 1445 struct nf_conntrack_tuple_hash *h; 1446 enum ip_conntrack_info ctinfo; 1447 struct nf_conntrack_zone tmp; 1448 struct nf_conn *ct; 1449 u32 hash; 1450 1451 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), 1452 dataoff, state->pf, protonum, state->net, 1453 &tuple, l4proto)) { 1454 pr_debug("Can't get tuple\n"); 1455 return 0; 1456 } 1457 1458 /* look for tuple match */ 1459 zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); 1460 hash = hash_conntrack_raw(&tuple, state->net); 1461 h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); 1462 if (!h) { 1463 h = init_conntrack(state->net, tmpl, &tuple, l4proto, 1464 skb, dataoff, hash); 1465 if (!h) 1466 return 0; 1467 if (IS_ERR(h)) 1468 return PTR_ERR(h); 1469 } 1470 ct = nf_ct_tuplehash_to_ctrack(h); 1471 1472 /* It exists; we have (non-exclusive) reference. */ 1473 if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { 1474 ctinfo = IP_CT_ESTABLISHED_REPLY; 1475 } else { 1476 /* Once we've had two way comms, always ESTABLISHED. */ 1477 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { 1478 pr_debug("normal packet for %p\n", ct); 1479 ctinfo = IP_CT_ESTABLISHED; 1480 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { 1481 pr_debug("related packet for %p\n", ct); 1482 ctinfo = IP_CT_RELATED; 1483 } else { 1484 pr_debug("new packet for %p\n", ct); 1485 ctinfo = IP_CT_NEW; 1486 } 1487 } 1488 nf_ct_set(skb, ct, ctinfo); 1489 return 0; 1490 } 1491 1492 /* 1493 * icmp packets need special treatment to handle error messages that are 1494 * related to a connection. 1495 * 1496 * Callers need to check if skb has a conntrack assigned when this 1497 * helper returns; in such case skb belongs to an already known connection. 
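 * The return value is a netfilter verdict (possibly negated); values
 * <= 0 are counted as error/invalid here and mapped back to a positive
 * verdict in nf_conntrack_in() via ret = -ret.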
1498 */ 1499 static unsigned int __cold 1500 nf_conntrack_handle_icmp(struct nf_conn *tmpl, 1501 struct sk_buff *skb, 1502 unsigned int dataoff, 1503 u8 protonum, 1504 const struct nf_hook_state *state) 1505 { 1506 int ret; 1507 1508 if (state->pf == NFPROTO_IPV4 && protonum == IPPROTO_ICMP) 1509 ret = nf_conntrack_icmpv4_error(tmpl, skb, dataoff, state); 1510 #if IS_ENABLED(CONFIG_IPV6) 1511 else if (state->pf == NFPROTO_IPV6 && protonum == IPPROTO_ICMPV6) 1512 ret = nf_conntrack_icmpv6_error(tmpl, skb, dataoff, state); 1513 #endif 1514 else 1515 return NF_ACCEPT; 1516 1517 if (ret <= 0) { 1518 NF_CT_STAT_INC_ATOMIC(state->net, error); 1519 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1520 } 1521 1522 return ret; 1523 } 1524 1525 unsigned int 1526 nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) 1527 { 1528 const struct nf_conntrack_l4proto *l4proto; 1529 enum ip_conntrack_info ctinfo; 1530 struct nf_conn *ct, *tmpl; 1531 u_int8_t protonum; 1532 int dataoff, ret; 1533 1534 tmpl = nf_ct_get(skb, &ctinfo); 1535 if (tmpl || ctinfo == IP_CT_UNTRACKED) { 1536 /* Previously seen (loopback or untracked)? Ignore. */ 1537 if ((tmpl && !nf_ct_is_template(tmpl)) || 1538 ctinfo == IP_CT_UNTRACKED) { 1539 NF_CT_STAT_INC_ATOMIC(state->net, ignore); 1540 return NF_ACCEPT; 1541 } 1542 skb->_nfct = 0; 1543 } 1544 1545 /* rcu_read_lock()ed by nf_hook_thresh */ 1546 dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum); 1547 if (dataoff <= 0) { 1548 pr_debug("not prepared to track yet or error occurred\n"); 1549 NF_CT_STAT_INC_ATOMIC(state->net, error); 1550 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1551 ret = NF_ACCEPT; 1552 goto out; 1553 } 1554 1555 l4proto = __nf_ct_l4proto_find(protonum); 1556 1557 if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) { 1558 ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff, 1559 protonum, state); 1560 if (ret <= 0) { 1561 ret = -ret; 1562 goto out; 1563 } 1564 /* ICMP[v6] protocol trackers may assign one conntrack. */ 1565 if (skb->_nfct) 1566 goto out; 1567 } 1568 repeat: 1569 ret = resolve_normal_ct(tmpl, skb, dataoff, 1570 protonum, l4proto, state); 1571 if (ret < 0) { 1572 /* Too stressed to deal. */ 1573 NF_CT_STAT_INC_ATOMIC(state->net, drop); 1574 ret = NF_DROP; 1575 goto out; 1576 } 1577 1578 ct = nf_ct_get(skb, &ctinfo); 1579 if (!ct) { 1580 /* Not valid part of a connection */ 1581 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1582 ret = NF_ACCEPT; 1583 goto out; 1584 } 1585 1586 ret = l4proto->packet(ct, skb, dataoff, ctinfo, state); 1587 if (ret <= 0) { 1588 /* Invalid: inverse of the return code tells 1589 * the netfilter core what to do */ 1590 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 1591 nf_conntrack_put(&ct->ct_general); 1592 skb->_nfct = 0; 1593 NF_CT_STAT_INC_ATOMIC(state->net, invalid); 1594 if (ret == -NF_DROP) 1595 NF_CT_STAT_INC_ATOMIC(state->net, drop); 1596 /* Special case: TCP tracker reports an attempt to reopen a 1597 * closed/aborted connection. We have to go back and create a 1598 * fresh conntrack. 
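 * The tracker signals this with -NF_REPEAT; the stale entry has already
 * been released above (skb->_nfct cleared), so jumping back to "repeat"
 * re-runs resolve_normal_ct() and allocates a fresh entry for the same
 * packet.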
1599 */ 1600 if (ret == -NF_REPEAT) 1601 goto repeat; 1602 ret = -ret; 1603 goto out; 1604 } 1605 1606 if (ctinfo == IP_CT_ESTABLISHED_REPLY && 1607 !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) 1608 nf_conntrack_event_cache(IPCT_REPLY, ct); 1609 out: 1610 if (tmpl) 1611 nf_ct_put(tmpl); 1612 1613 return ret; 1614 } 1615 EXPORT_SYMBOL_GPL(nf_conntrack_in); 1616 1617 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse, 1618 const struct nf_conntrack_tuple *orig) 1619 { 1620 bool ret; 1621 1622 rcu_read_lock(); 1623 ret = nf_ct_invert_tuple(inverse, orig, 1624 __nf_ct_l4proto_find(orig->dst.protonum)); 1625 rcu_read_unlock(); 1626 return ret; 1627 } 1628 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr); 1629 1630 /* Alter reply tuple (maybe alter helper). This is for NAT, and is 1631 implicitly racy: see __nf_conntrack_confirm */ 1632 void nf_conntrack_alter_reply(struct nf_conn *ct, 1633 const struct nf_conntrack_tuple *newreply) 1634 { 1635 struct nf_conn_help *help = nfct_help(ct); 1636 1637 /* Should be unconfirmed, so not in hash table yet */ 1638 WARN_ON(nf_ct_is_confirmed(ct)); 1639 1640 pr_debug("Altering reply tuple of %p to ", ct); 1641 nf_ct_dump_tuple(newreply); 1642 1643 ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; 1644 if (ct->master || (help && !hlist_empty(&help->expectations))) 1645 return; 1646 1647 rcu_read_lock(); 1648 __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); 1649 rcu_read_unlock(); 1650 } 1651 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); 1652 1653 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ 1654 void __nf_ct_refresh_acct(struct nf_conn *ct, 1655 enum ip_conntrack_info ctinfo, 1656 const struct sk_buff *skb, 1657 unsigned long extra_jiffies, 1658 int do_acct) 1659 { 1660 WARN_ON(!skb); 1661 1662 /* Only update if this is not a fixed timeout */ 1663 if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) 1664 goto acct; 1665 1666 /* If not in hash table, timer will not be active yet */ 1667 if (nf_ct_is_confirmed(ct)) 1668 extra_jiffies += nfct_time_stamp; 1669 1670 ct->timeout = extra_jiffies; 1671 acct: 1672 if (do_acct) 1673 nf_ct_acct_update(ct, ctinfo, skb->len); 1674 } 1675 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); 1676 1677 bool nf_ct_kill_acct(struct nf_conn *ct, 1678 enum ip_conntrack_info ctinfo, 1679 const struct sk_buff *skb) 1680 { 1681 nf_ct_acct_update(ct, ctinfo, skb->len); 1682 1683 return nf_ct_delete(ct, 0, 0); 1684 } 1685 EXPORT_SYMBOL_GPL(nf_ct_kill_acct); 1686 1687 #if IS_ENABLED(CONFIG_NF_CT_NETLINK) 1688 1689 #include <linux/netfilter/nfnetlink.h> 1690 #include <linux/netfilter/nfnetlink_conntrack.h> 1691 #include <linux/mutex.h> 1692 1693 /* Generic function for tcp/udp/sctp/dccp and alike. 
This needs to be 1694 * in ip_conntrack_core, since we don't want the protocols to autoload 1695 * or depend on ctnetlink */ 1696 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, 1697 const struct nf_conntrack_tuple *tuple) 1698 { 1699 if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) || 1700 nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port)) 1701 goto nla_put_failure; 1702 return 0; 1703 1704 nla_put_failure: 1705 return -1; 1706 } 1707 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr); 1708 1709 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { 1710 [CTA_PROTO_SRC_PORT] = { .type = NLA_U16 }, 1711 [CTA_PROTO_DST_PORT] = { .type = NLA_U16 }, 1712 }; 1713 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); 1714 1715 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], 1716 struct nf_conntrack_tuple *t) 1717 { 1718 if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) 1719 return -EINVAL; 1720 1721 t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); 1722 t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); 1723 1724 return 0; 1725 } 1726 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple); 1727 1728 unsigned int nf_ct_port_nlattr_tuple_size(void) 1729 { 1730 static unsigned int size __read_mostly; 1731 1732 if (!size) 1733 size = nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); 1734 1735 return size; 1736 } 1737 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size); 1738 #endif 1739 1740 /* Used by ipt_REJECT and ip6t_REJECT. */ 1741 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb) 1742 { 1743 struct nf_conn *ct; 1744 enum ip_conntrack_info ctinfo; 1745 1746 /* This ICMP is in reverse direction to the packet which caused it */ 1747 ct = nf_ct_get(skb, &ctinfo); 1748 if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) 1749 ctinfo = IP_CT_RELATED_REPLY; 1750 else 1751 ctinfo = IP_CT_RELATED; 1752 1753 /* Attach to new skbuff, and increment count */ 1754 nf_ct_set(nskb, ct, ctinfo); 1755 nf_conntrack_get(skb_nfct(nskb)); 1756 } 1757 1758 static int nf_conntrack_update(struct net *net, struct sk_buff *skb) 1759 { 1760 const struct nf_conntrack_l4proto *l4proto; 1761 struct nf_conntrack_tuple_hash *h; 1762 struct nf_conntrack_tuple tuple; 1763 enum ip_conntrack_info ctinfo; 1764 struct nf_nat_hook *nat_hook; 1765 unsigned int status; 1766 struct nf_conn *ct; 1767 int dataoff; 1768 u16 l3num; 1769 u8 l4num; 1770 1771 ct = nf_ct_get(skb, &ctinfo); 1772 if (!ct || nf_ct_is_confirmed(ct)) 1773 return 0; 1774 1775 l3num = nf_ct_l3num(ct); 1776 1777 dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 1778 if (dataoff <= 0) 1779 return -1; 1780 1781 l4proto = nf_ct_l4proto_find_get(l4num); 1782 1783 if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 1784 l4num, net, &tuple, l4proto)) 1785 return -1; 1786 1787 if (ct->status & IPS_SRC_NAT) { 1788 memcpy(tuple.src.u3.all, 1789 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 1790 sizeof(tuple.src.u3.all)); 1791 tuple.src.u.all = 1792 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 1793 } 1794 1795 if (ct->status & IPS_DST_NAT) { 1796 memcpy(tuple.dst.u3.all, 1797 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 1798 sizeof(tuple.dst.u3.all)); 1799 tuple.dst.u.all = 1800 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 1801 } 1802 1803 h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 1804 if (!h) 1805 return 0; 1806 1807 /* Store status bits of the conntrack that is clashing to re-do NAT 1808 * mangling according to what it has been done 
already to this packet. 1809 */ 1810 status = ct->status; 1811 1812 nf_ct_put(ct); 1813 ct = nf_ct_tuplehash_to_ctrack(h); 1814 nf_ct_set(skb, ct, ctinfo); 1815 1816 nat_hook = rcu_dereference(nf_nat_hook); 1817 if (!nat_hook) 1818 return 0; 1819 1820 if (status & IPS_SRC_NAT && 1821 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC, 1822 IP_CT_DIR_ORIGINAL) == NF_DROP) 1823 return -1; 1824 1825 if (status & IPS_DST_NAT && 1826 nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_DST, 1827 IP_CT_DIR_ORIGINAL) == NF_DROP) 1828 return -1; 1829 1830 return 0; 1831 } 1832 1833 static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, 1834 const struct sk_buff *skb) 1835 { 1836 const struct nf_conntrack_tuple *src_tuple; 1837 const struct nf_conntrack_tuple_hash *hash; 1838 struct nf_conntrack_tuple srctuple; 1839 enum ip_conntrack_info ctinfo; 1840 struct nf_conn *ct; 1841 1842 ct = nf_ct_get(skb, &ctinfo); 1843 if (ct) { 1844 src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo)); 1845 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 1846 return true; 1847 } 1848 1849 if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 1850 NFPROTO_IPV4, dev_net(skb->dev), 1851 &srctuple)) 1852 return false; 1853 1854 hash = nf_conntrack_find_get(dev_net(skb->dev), 1855 &nf_ct_zone_dflt, 1856 &srctuple); 1857 if (!hash) 1858 return false; 1859 1860 ct = nf_ct_tuplehash_to_ctrack(hash); 1861 src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir); 1862 memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple)); 1863 nf_ct_put(ct); 1864 1865 return true; 1866 } 1867 1868 /* Bring out ya dead! */ 1869 static struct nf_conn * 1870 get_next_corpse(int (*iter)(struct nf_conn *i, void *data), 1871 void *data, unsigned int *bucket) 1872 { 1873 struct nf_conntrack_tuple_hash *h; 1874 struct nf_conn *ct; 1875 struct hlist_nulls_node *n; 1876 spinlock_t *lockp; 1877 1878 for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { 1879 lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; 1880 local_bh_disable(); 1881 nf_conntrack_lock(lockp); 1882 if (*bucket < nf_conntrack_htable_size) { 1883 hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { 1884 if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) 1885 continue; 1886 ct = nf_ct_tuplehash_to_ctrack(h); 1887 if (iter(ct, data)) 1888 goto found; 1889 } 1890 } 1891 spin_unlock(lockp); 1892 local_bh_enable(); 1893 cond_resched(); 1894 } 1895 1896 return NULL; 1897 found: 1898 atomic_inc(&ct->ct_general.use); 1899 spin_unlock(lockp); 1900 local_bh_enable(); 1901 return ct; 1902 } 1903 1904 static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), 1905 void *data, u32 portid, int report) 1906 { 1907 unsigned int bucket = 0, sequence; 1908 struct nf_conn *ct; 1909 1910 might_sleep(); 1911 1912 for (;;) { 1913 sequence = read_seqcount_begin(&nf_conntrack_generation); 1914 1915 while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { 1916 /* Time to push up daises... 
*/ 1917 1918 nf_ct_delete(ct, portid, report); 1919 nf_ct_put(ct); 1920 cond_resched(); 1921 } 1922 1923 if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) 1924 break; 1925 bucket = 0; 1926 } 1927 } 1928 1929 struct iter_data { 1930 int (*iter)(struct nf_conn *i, void *data); 1931 void *data; 1932 struct net *net; 1933 }; 1934 1935 static int iter_net_only(struct nf_conn *i, void *data) 1936 { 1937 struct iter_data *d = data; 1938 1939 if (!net_eq(d->net, nf_ct_net(i))) 1940 return 0; 1941 1942 return d->iter(i, d->data); 1943 } 1944 1945 static void 1946 __nf_ct_unconfirmed_destroy(struct net *net) 1947 { 1948 int cpu; 1949 1950 for_each_possible_cpu(cpu) { 1951 struct nf_conntrack_tuple_hash *h; 1952 struct hlist_nulls_node *n; 1953 struct ct_pcpu *pcpu; 1954 1955 pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu); 1956 1957 spin_lock_bh(&pcpu->lock); 1958 hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) { 1959 struct nf_conn *ct; 1960 1961 ct = nf_ct_tuplehash_to_ctrack(h); 1962 1963 /* we cannot call iter() on unconfirmed list, the 1964 * owning cpu can reallocate ct->ext at any time. 1965 */ 1966 set_bit(IPS_DYING_BIT, &ct->status); 1967 } 1968 spin_unlock_bh(&pcpu->lock); 1969 cond_resched(); 1970 } 1971 } 1972 1973 void nf_ct_unconfirmed_destroy(struct net *net) 1974 { 1975 might_sleep(); 1976 1977 if (atomic_read(&net->ct.count) > 0) { 1978 __nf_ct_unconfirmed_destroy(net); 1979 nf_queue_nf_hook_drop(net); 1980 synchronize_net(); 1981 } 1982 } 1983 EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy); 1984 1985 void nf_ct_iterate_cleanup_net(struct net *net, 1986 int (*iter)(struct nf_conn *i, void *data), 1987 void *data, u32 portid, int report) 1988 { 1989 struct iter_data d; 1990 1991 might_sleep(); 1992 1993 if (atomic_read(&net->ct.count) == 0) 1994 return; 1995 1996 d.iter = iter; 1997 d.data = data; 1998 d.net = net; 1999 2000 nf_ct_iterate_cleanup(iter_net_only, &d, portid, report); 2001 } 2002 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net); 2003 2004 /** 2005 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table 2006 * @iter: callback to invoke for each conntrack 2007 * @data: data to pass to @iter 2008 * 2009 * Like nf_ct_iterate_cleanup, but first marks conntracks on the 2010 * unconfirmed list as dying (so they will not be inserted into 2011 * main table). 2012 * 2013 * Can only be called in module exit path. 2014 */ 2015 void 2016 nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data) 2017 { 2018 struct net *net; 2019 2020 down_read(&net_rwsem); 2021 for_each_net(net) { 2022 if (atomic_read(&net->ct.count) == 0) 2023 continue; 2024 __nf_ct_unconfirmed_destroy(net); 2025 nf_queue_nf_hook_drop(net); 2026 } 2027 up_read(&net_rwsem); 2028 2029 /* Need to wait for netns cleanup worker to finish, if its 2030 * running -- it might have deleted a net namespace from 2031 * the global list, so our __nf_ct_unconfirmed_destroy() might 2032 * not have affected all namespaces. 2033 */ 2034 net_ns_barrier(); 2035 2036 /* a conntrack could have been unlinked from unconfirmed list 2037 * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy(). 2038 * This makes sure its inserted into conntrack table. 
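 * (The synchronize_net() below waits for packet processing that was
 * already in flight, so such an entry will have finished
 * __nf_conntrack_confirm() and is visible to the nf_ct_iterate_cleanup()
 * pass that follows.)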

/**
 * nf_ct_iterate_destroy - destroy unconfirmed conntracks and iterate table
 * @iter: callback to invoke for each conntrack
 * @data: data to pass to @iter
 *
 * Like nf_ct_iterate_cleanup, but first marks conntracks on the
 * unconfirmed list as dying (so they will not be inserted into
 * the main table).
 *
 * Can only be called in the module exit path.
 */
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
	struct net *net;

	down_read(&net_rwsem);
	for_each_net(net) {
		if (atomic_read(&net->ct.count) == 0)
			continue;
		__nf_ct_unconfirmed_destroy(net);
		nf_queue_nf_hook_drop(net);
	}
	up_read(&net_rwsem);

	/* Need to wait for the netns cleanup worker to finish, if it's
	 * running -- it might have deleted a net namespace from
	 * the global list, so our __nf_ct_unconfirmed_destroy() might
	 * not have affected all namespaces.
	 */
	net_ns_barrier();

	/* a conntrack could have been unlinked from the unconfirmed list
	 * before we grabbed the pcpu lock in __nf_ct_unconfirmed_destroy().
	 * This makes sure it's inserted into the conntrack table.
	 */
	synchronize_net();

	nf_ct_iterate_cleanup(iter, data, 0, 0);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);

static int kill_all(struct nf_conn *i, void *data)
{
	return net_eq(nf_ct_net(i), data);
}

void nf_conntrack_cleanup_start(void)
{
	conntrack_gc_work.exiting = true;
	RCU_INIT_POINTER(ip_ct_attach, NULL);
}

void nf_conntrack_cleanup_end(void)
{
	RCU_INIT_POINTER(nf_ct_hook, NULL);
	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
	kvfree(nf_conntrack_hash);

	nf_conntrack_proto_fini();
	nf_conntrack_seqadj_fini();
	nf_conntrack_labels_fini();
	nf_conntrack_helper_fini();
	nf_conntrack_timeout_fini();
	nf_conntrack_ecache_fini();
	nf_conntrack_tstamp_fini();
	nf_conntrack_acct_fini();
	nf_conntrack_expect_fini();

	kmem_cache_destroy(nf_conntrack_cachep);
}

/*
 * Mishearing the voices in his head, our hero wonders how he's
 * supposed to kill the mall.
 */
void nf_conntrack_cleanup_net(struct net *net)
{
	LIST_HEAD(single);

	list_add(&net->exit_list, &single);
	nf_conntrack_cleanup_net_list(&single);
}

void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
	int busy;
	struct net *net;

	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework. Roll on, two-stage module
	 * delete...
	 */
	synchronize_net();
i_see_dead_people:
	busy = 0;
	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_ct_iterate_cleanup(kill_all, net, 0, 0);
		if (atomic_read(&net->ct.count) != 0)
			busy = 1;
	}
	if (busy) {
		schedule();
		goto i_see_dead_people;
	}

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_conntrack_proto_pernet_fini(net);
		nf_conntrack_ecache_pernet_fini(net);
		nf_conntrack_expect_pernet_fini(net);
		free_percpu(net->ct.stat);
		free_percpu(net->ct.pcpu_lists);
	}
}

void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;

	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
		return NULL;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));

	hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
			      GFP_KERNEL | __GFP_ZERO);

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
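
/* Worked example (illustrative, not part of the original source): with
 * 4 KiB pages and an 8-byte struct hlist_nulls_head, PAGE_SIZE / sizeof()
 * is 512, so a request for e.g. 1000 buckets is rounded up to 1024 and
 * written back through @sizep; callers must therefore use the value
 * returned via @sizep, not the size they originally asked for.
 */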

int nf_conntrack_hash_resize(unsigned int hashsize)
{
	int i, bucket;
	unsigned int old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, 1);
	if (!hash)
		return -ENOMEM;

	old_size = nf_conntrack_htable_size;
	if (old_size == hashsize) {
		kvfree(hash);
		return 0;
	}

	local_bh_disable();
	nf_conntrack_all_lock();
	write_seqcount_begin(&nf_conntrack_generation);

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash,
	 * though, since that would require taking the locks.
	 */

	for (i = 0; i < nf_conntrack_htable_size; i++) {
		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			ct = nf_ct_tuplehash_to_ctrack(h);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(nf_ct_net(ct),
						  &h->tuple, hashsize);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = nf_conntrack_htable_size;
	old_hash = nf_conntrack_hash;

	nf_conntrack_hash = hash;
	nf_conntrack_htable_size = hashsize;

	write_seqcount_end(&nf_conntrack_generation);
	nf_conntrack_all_unlock();
	local_bh_enable();

	synchronize_net();
	kvfree(old_hash);
	return 0;
}

int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
{
	unsigned int hashsize;
	int rc;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_hash)
		return param_set_uint(val, kp);

	rc = kstrtouint(val, 0, &hashsize);
	if (rc)
		return rc;

	return nf_conntrack_hash_resize(hashsize);
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
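
/* Note (editor's addition, hedged): this setter is the handler behind the
 * nf_conntrack "hashsize" module parameter; the module_param_call() wiring
 * is expected to live outside this file. On a typical system the table can
 * therefore be resized at runtime from the initial network namespace,
 * roughly as follows:
 *
 *	# echo 131072 > /sys/module/nf_conntrack/parameters/hashsize
 *
 * Writes from other namespaces return -EOPNOTSUPP, as enforced above.
 */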

static __always_inline unsigned int total_extension_size(void)
{
	/* remember to add new extensions below */
	BUILD_BUG_ON(NF_CT_EXT_NUM > 9);

	return sizeof(struct nf_ct_ext) +
	       sizeof(struct nf_conn_help)
#if IS_ENABLED(CONFIG_NF_NAT)
		+ sizeof(struct nf_conn_nat)
#endif
		+ sizeof(struct nf_conn_seqadj)
		+ sizeof(struct nf_conn_acct)
#ifdef CONFIG_NF_CONNTRACK_EVENTS
		+ sizeof(struct nf_conntrack_ecache)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
		+ sizeof(struct nf_conn_tstamp)
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
		+ sizeof(struct nf_conn_timeout)
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
		+ sizeof(struct nf_conn_labels)
#endif
#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY)
		+ sizeof(struct nf_conn_synproxy)
#endif
	;
}

int nf_conntrack_init_start(void)
{
	unsigned long nr_pages = totalram_pages();
	int max_factor = 8;
	int ret = -ENOMEM;
	int i;

	/* struct nf_ct_ext uses u8 to store offsets/size */
	BUILD_BUG_ON(total_extension_size() > 255u);

	seqcount_init(&nf_conntrack_generation);

	for (i = 0; i < CONNTRACK_LOCKS; i++)
		spin_lock_init(&nf_conntrack_locks[i]);

	if (!nf_conntrack_htable_size) {
		/* Idea from tcp.c: use 1/16384 of memory.
		 * On i386: 32MB machine has 512 buckets.
		 * >= 1GB machines have 16384 buckets.
		 * >= 4GB machines have 65536 buckets.
		 */
		nf_conntrack_htable_size
			= (((nr_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
			nf_conntrack_htable_size = 65536;
		else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries.
		 */
		max_factor = 4;
	}

	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
	if (!nf_conntrack_hash)
		return -ENOMEM;

	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
						sizeof(struct nf_conn),
						NFCT_INFOMASK + 1,
						SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
	if (!nf_conntrack_cachep)
		goto err_cachep;

	ret = nf_conntrack_expect_init();
	if (ret < 0)
		goto err_expect;

	ret = nf_conntrack_acct_init();
	if (ret < 0)
		goto err_acct;

	ret = nf_conntrack_tstamp_init();
	if (ret < 0)
		goto err_tstamp;

	ret = nf_conntrack_ecache_init();
	if (ret < 0)
		goto err_ecache;

	ret = nf_conntrack_timeout_init();
	if (ret < 0)
		goto err_timeout;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	ret = nf_conntrack_labels_init();
	if (ret < 0)
		goto err_labels;

	ret = nf_conntrack_seqadj_init();
	if (ret < 0)
		goto err_seqadj;

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	conntrack_gc_work_init(&conntrack_gc_work);
	queue_delayed_work(system_power_efficient_wq, &conntrack_gc_work.dwork, HZ);

	return 0;

err_proto:
	nf_conntrack_seqadj_fini();
err_seqadj:
	nf_conntrack_labels_fini();
err_labels:
	nf_conntrack_helper_fini();
err_helper:
	nf_conntrack_timeout_fini();
err_timeout:
	nf_conntrack_ecache_fini();
err_ecache:
	nf_conntrack_tstamp_fini();
err_tstamp:
	nf_conntrack_acct_fini();
err_acct:
	nf_conntrack_expect_fini();
err_expect:
	kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
	kvfree(nf_conntrack_hash);
	return ret;
}
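
/* Worked example (illustrative, not part of the original source): on a
 * machine with slightly more than 4 GiB of RAM and 4 KiB pages, the
 * 1/16384-of-memory formula alone would give roughly
 * (4 GiB / 16384) / 8 bytes = 32768 buckets, but the ">= 4GB" branch
 * above overrides this to 65536 buckets; with max_factor == 4 the
 * default nf_conntrack_max then becomes 4 * 65536 = 262144 entries.
 */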

static struct nf_ct_hook nf_conntrack_hook = {
	.update		= nf_conntrack_update,
	.destroy	= destroy_conntrack,
	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
};

void nf_conntrack_init_end(void)
{
	/* For use by REJECT target */
	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
	RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
}

/*
 * We need to use special "null" values, not used in the hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)
#define TEMPLATE_NULLS_VAL	((1<<30)+2)

int nf_conntrack_init_net(struct net *net)
{
	int ret = -ENOMEM;
	int cpu;

	BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
	atomic_set(&net->ct.count, 0);

	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
	if (!net->ct.pcpu_lists)
		goto err_stat;

	for_each_possible_cpu(cpu) {
		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);

		spin_lock_init(&pcpu->lock);
		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
	}

	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat)
		goto err_pcpu_lists;

	ret = nf_conntrack_expect_pernet_init(net);
	if (ret < 0)
		goto err_expect;

	nf_conntrack_acct_pernet_init(net);
	nf_conntrack_tstamp_pernet_init(net);
	nf_conntrack_ecache_pernet_init(net);
	nf_conntrack_helper_pernet_init(net);

	ret = nf_conntrack_proto_pernet_init(net);
	if (ret < 0)
		goto err_proto;
	return 0;

err_proto:
	nf_conntrack_ecache_pernet_fini(net);
	nf_conntrack_expect_pernet_fini(net);
err_expect:
	free_percpu(net->ct.stat);
err_pcpu_lists:
	free_percpu(net->ct.pcpu_lists);
err_stat:
	return ret;
}
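
/* Note (editor's addition): the *_NULLS_VAL constants above are the "nulls"
 * markers terminating the per-cpu unconfirmed/dying lists initialized in
 * nf_conntrack_init_net(). An RCU reader walking an hlist_nulls chain can
 * inspect the terminating marker (get_nulls_value()) to tell whether it is
 * still on the hash bucket it started from -- buckets use their own index
 * as the marker, see nf_ct_alloc_hashtable() -- or whether the entry was
 * moved to one of these special lists; the (1<<30)+n values are chosen well
 * above any realistic bucket index, and a mismatch tells the reader to
 * restart its lookup.
 */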