1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4 
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
7  * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
8  * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
9  *
10  * This program is free software; you can redistribute it and/or modify
11  * it under the terms of the GNU General Public License version 2 as
12  * published by the Free Software Foundation.
13  */
14 
15 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16 
17 #include <linux/types.h>
18 #include <linux/netfilter.h>
19 #include <linux/module.h>
20 #include <linux/sched.h>
21 #include <linux/skbuff.h>
22 #include <linux/proc_fs.h>
23 #include <linux/vmalloc.h>
24 #include <linux/stddef.h>
25 #include <linux/slab.h>
26 #include <linux/random.h>
27 #include <linux/jhash.h>
28 #include <linux/err.h>
29 #include <linux/percpu.h>
30 #include <linux/moduleparam.h>
31 #include <linux/notifier.h>
32 #include <linux/kernel.h>
33 #include <linux/netdevice.h>
34 #include <linux/socket.h>
35 #include <linux/mm.h>
36 #include <linux/nsproxy.h>
37 #include <linux/rculist_nulls.h>
38 
39 #include <net/netfilter/nf_conntrack.h>
40 #include <net/netfilter/nf_conntrack_l3proto.h>
41 #include <net/netfilter/nf_conntrack_l4proto.h>
42 #include <net/netfilter/nf_conntrack_expect.h>
43 #include <net/netfilter/nf_conntrack_helper.h>
44 #include <net/netfilter/nf_conntrack_seqadj.h>
45 #include <net/netfilter/nf_conntrack_core.h>
46 #include <net/netfilter/nf_conntrack_extend.h>
47 #include <net/netfilter/nf_conntrack_acct.h>
48 #include <net/netfilter/nf_conntrack_ecache.h>
49 #include <net/netfilter/nf_conntrack_zones.h>
50 #include <net/netfilter/nf_conntrack_timestamp.h>
51 #include <net/netfilter/nf_conntrack_timeout.h>
52 #include <net/netfilter/nf_conntrack_labels.h>
53 #include <net/netfilter/nf_conntrack_synproxy.h>
54 #include <net/netfilter/nf_nat.h>
55 #include <net/netfilter/nf_nat_core.h>
56 #include <net/netfilter/nf_nat_helper.h>
57 #include <net/netns/hash.h>
58 
59 #define NF_CONNTRACK_VERSION	"0.5.0"
60 
61 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
62 				      enum nf_nat_manip_type manip,
63 				      const struct nlattr *attr) __read_mostly;
64 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
65 
66 __cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
67 EXPORT_SYMBOL_GPL(nf_conntrack_locks);
68 
69 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
70 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
71 
72 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
73 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
74 
75 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
77 static __read_mostly seqcount_t nf_conntrack_generation;
78 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
79 static __read_mostly bool nf_conntrack_locks_all;
80 
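/* Locking overview (as visible in this file):
 * - nf_conntrack_locks[] protects the hash buckets; a bucket's lock is
 *   chosen by reducing the hash modulo CONNTRACK_LOCKS.
 * - nf_conntrack_locks_all, together with nf_conntrack_locks_all_lock,
 *   lets nf_conntrack_all_lock() exclude every per-bucket lock holder
 *   (used when the hash table is resized).
 * - nf_conntrack_generation is bumped whenever the hash table is
 *   replaced, so lockless lookups and the double-lock helpers can detect
 *   the change and retry.
 */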
81 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
82 {
83 	spin_lock(lock);
84 	while (unlikely(nf_conntrack_locks_all)) {
85 		spin_unlock(lock);
86 		spin_unlock_wait(&nf_conntrack_locks_all_lock);
87 		spin_lock(lock);
88 	}
89 }
90 EXPORT_SYMBOL_GPL(nf_conntrack_lock);
91 
92 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
93 {
94 	h1 %= CONNTRACK_LOCKS;
95 	h2 %= CONNTRACK_LOCKS;
96 	spin_unlock(&nf_conntrack_locks[h1]);
97 	if (h1 != h2)
98 		spin_unlock(&nf_conntrack_locks[h2]);
99 }
100 
101 /* return true if we need to recompute hashes (in case hash table was resized) */
102 static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
103 				     unsigned int h2, unsigned int sequence)
104 {
105 	h1 %= CONNTRACK_LOCKS;
106 	h2 %= CONNTRACK_LOCKS;
107 	if (h1 <= h2) {
108 		nf_conntrack_lock(&nf_conntrack_locks[h1]);
109 		if (h1 != h2)
110 			spin_lock_nested(&nf_conntrack_locks[h2],
111 					 SINGLE_DEPTH_NESTING);
112 	} else {
113 		nf_conntrack_lock(&nf_conntrack_locks[h2]);
114 		spin_lock_nested(&nf_conntrack_locks[h1],
115 				 SINGLE_DEPTH_NESTING);
116 	}
117 	if (read_seqcount_retry(&nf_conntrack_generation, sequence)) {
118 		nf_conntrack_double_unlock(h1, h2);
119 		return true;
120 	}
121 	return false;
122 }
123 
124 static void nf_conntrack_all_lock(void)
125 {
126 	int i;
127 
128 	spin_lock(&nf_conntrack_locks_all_lock);
129 	nf_conntrack_locks_all = true;
130 
131 	for (i = 0; i < CONNTRACK_LOCKS; i++) {
132 		spin_unlock_wait(&nf_conntrack_locks[i]);
133 	}
134 }
135 
136 static void nf_conntrack_all_unlock(void)
137 {
138 	nf_conntrack_locks_all = false;
139 	spin_unlock(&nf_conntrack_locks_all_lock);
140 }
141 
142 unsigned int nf_conntrack_htable_size __read_mostly;
143 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
144 
145 unsigned int nf_conntrack_max __read_mostly;
146 EXPORT_SYMBOL_GPL(nf_conntrack_max);
147 
148 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
149 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
150 
151 static unsigned int nf_conntrack_hash_rnd __read_mostly;
152 
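/* Compute the raw (unscaled) hash of a tuple.  A per-boot random seed and
 * the netns hash are mixed in, so the bucket distribution is not
 * predictable from the tuple alone and differs between namespaces.
 */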
153 static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
154 			      const struct net *net)
155 {
156 	unsigned int n;
157 	u32 seed;
158 
159 	get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
160 
161 	/* The direction must be ignored, so we hash everything up to the
162 	 * destination ports (which is a multiple of 4) and treat the last
163 	 * three bytes manually.
164 	 */
165 	seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
166 	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
167 	return jhash2((u32 *)tuple, n, seed ^
168 		      (((__force __u16)tuple->dst.u.all << 16) |
169 		      tuple->dst.protonum));
170 }
171 
172 static u32 scale_hash(u32 hash)
173 {
174 	return reciprocal_scale(hash, nf_conntrack_htable_size);
175 }
176 
177 static u32 __hash_conntrack(const struct net *net,
178 			    const struct nf_conntrack_tuple *tuple,
179 			    unsigned int size)
180 {
181 	return reciprocal_scale(hash_conntrack_raw(tuple, net), size);
182 }
183 
184 static u32 hash_conntrack(const struct net *net,
185 			  const struct nf_conntrack_tuple *tuple)
186 {
187 	return scale_hash(hash_conntrack_raw(tuple, net));
188 }
189 
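/* Fill @tuple from the packet headers, using the given l3/l4 protocol
 * helpers to extract the address and port/ID fields.  Returns false if
 * either helper cannot parse the packet.
 */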
190 bool
191 nf_ct_get_tuple(const struct sk_buff *skb,
192 		unsigned int nhoff,
193 		unsigned int dataoff,
194 		u_int16_t l3num,
195 		u_int8_t protonum,
196 		struct net *net,
197 		struct nf_conntrack_tuple *tuple,
198 		const struct nf_conntrack_l3proto *l3proto,
199 		const struct nf_conntrack_l4proto *l4proto)
200 {
201 	memset(tuple, 0, sizeof(*tuple));
202 
203 	tuple->src.l3num = l3num;
204 	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
205 		return false;
206 
207 	tuple->dst.protonum = protonum;
208 	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
209 
210 	return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
211 }
212 EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
213 
214 bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
215 		       u_int16_t l3num,
216 		       struct net *net, struct nf_conntrack_tuple *tuple)
217 {
218 	struct nf_conntrack_l3proto *l3proto;
219 	struct nf_conntrack_l4proto *l4proto;
220 	unsigned int protoff;
221 	u_int8_t protonum;
222 	int ret;
223 
224 	rcu_read_lock();
225 
226 	l3proto = __nf_ct_l3proto_find(l3num);
227 	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
228 	if (ret != NF_ACCEPT) {
229 		rcu_read_unlock();
230 		return false;
231 	}
232 
233 	l4proto = __nf_ct_l4proto_find(l3num, protonum);
234 
235 	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
236 			      l3proto, l4proto);
237 
238 	rcu_read_unlock();
239 	return ret;
240 }
241 EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
242 
243 bool
244 nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
245 		   const struct nf_conntrack_tuple *orig,
246 		   const struct nf_conntrack_l3proto *l3proto,
247 		   const struct nf_conntrack_l4proto *l4proto)
248 {
249 	memset(inverse, 0, sizeof(*inverse));
250 
251 	inverse->src.l3num = orig->src.l3num;
252 	if (l3proto->invert_tuple(inverse, orig) == 0)
253 		return false;
254 
255 	inverse->dst.dir = !orig->dst.dir;
256 
257 	inverse->dst.protonum = orig->dst.protonum;
258 	return l4proto->invert_tuple(inverse, orig);
259 }
260 EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
261 
262 static void
263 clean_from_lists(struct nf_conn *ct)
264 {
265 	pr_debug("clean_from_lists(%p)\n", ct);
266 	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
267 	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
268 
269 	/* Destroy all pending expectations */
270 	nf_ct_remove_expectations(ct);
271 }
272 
273 /* must be called with local_bh_disable */
274 static void nf_ct_add_to_dying_list(struct nf_conn *ct)
275 {
276 	struct ct_pcpu *pcpu;
277 
278 	/* add this conntrack to the (per cpu) dying list */
279 	ct->cpu = smp_processor_id();
280 	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
281 
282 	spin_lock(&pcpu->lock);
283 	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
284 			     &pcpu->dying);
285 	spin_unlock(&pcpu->lock);
286 }
287 
288 /* must be called with local_bh_disable */
289 static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
290 {
291 	struct ct_pcpu *pcpu;
292 
293 	/* add this conntrack to the (per cpu) unconfirmed list */
294 	ct->cpu = smp_processor_id();
295 	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
296 
297 	spin_lock(&pcpu->lock);
298 	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
299 			     &pcpu->unconfirmed);
300 	spin_unlock(&pcpu->lock);
301 }
302 
303 /* must be called with local_bh_disable */
304 static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
305 {
306 	struct ct_pcpu *pcpu;
307 
	/* We overload the first tuple to link into the unconfirmed or
	 * dying list.
	 */
309 	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
310 
311 	spin_lock(&pcpu->lock);
312 	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
313 	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
314 	spin_unlock(&pcpu->lock);
315 }
316 
317 /* Released via destroy_conntrack() */
318 struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
319 				 const struct nf_conntrack_zone *zone,
320 				 gfp_t flags)
321 {
322 	struct nf_conn *tmpl;
323 
324 	tmpl = kzalloc(sizeof(*tmpl), flags);
325 	if (tmpl == NULL)
326 		return NULL;
327 
328 	tmpl->status = IPS_TEMPLATE;
329 	write_pnet(&tmpl->ct_net, net);
330 	nf_ct_zone_add(tmpl, zone);
331 	atomic_set(&tmpl->ct_general.use, 0);
332 
333 	return tmpl;
334 }
335 EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc);
336 
337 void nf_ct_tmpl_free(struct nf_conn *tmpl)
338 {
339 	nf_ct_ext_destroy(tmpl);
340 	nf_ct_ext_free(tmpl);
341 	kfree(tmpl);
342 }
343 EXPORT_SYMBOL_GPL(nf_ct_tmpl_free);
344 
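/* Final release of a conntrack entry, invoked via the nf_ct_destroy hook
 * (see nf_conntrack_init_end()) once the reference count has dropped to
 * zero.  Templates are simply freed; real entries run the l4 protocol
 * destructor, drop any remaining expectations and are returned to the
 * slab cache by nf_conntrack_free().
 */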
345 static void
346 destroy_conntrack(struct nf_conntrack *nfct)
347 {
348 	struct nf_conn *ct = (struct nf_conn *)nfct;
349 	struct net *net = nf_ct_net(ct);
350 	struct nf_conntrack_l4proto *l4proto;
351 
352 	pr_debug("destroy_conntrack(%p)\n", ct);
353 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
354 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
355 
356 	if (unlikely(nf_ct_is_template(ct))) {
357 		nf_ct_tmpl_free(ct);
358 		return;
359 	}
360 	rcu_read_lock();
361 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
362 	if (l4proto->destroy)
363 		l4proto->destroy(ct);
364 
365 	rcu_read_unlock();
366 
367 	local_bh_disable();
368 	/* Expectations will have been removed in clean_from_lists,
369 	 * except TFTP can create an expectation on the first packet,
370 	 * before connection is in the list, so we need to clean here,
371 	 * too.
372 	 */
373 	nf_ct_remove_expectations(ct);
374 
375 	nf_ct_del_from_dying_or_unconfirmed_list(ct);
376 
377 	NF_CT_STAT_INC(net, delete);
378 	local_bh_enable();
379 
380 	if (ct->master)
381 		nf_ct_put(ct->master);
382 
383 	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
384 	nf_conntrack_free(ct);
385 }
386 
387 static void nf_ct_delete_from_lists(struct nf_conn *ct)
388 {
389 	struct net *net = nf_ct_net(ct);
390 	unsigned int hash, reply_hash;
391 	unsigned int sequence;
392 
393 	nf_ct_helper_destroy(ct);
394 
395 	local_bh_disable();
396 	do {
397 		sequence = read_seqcount_begin(&nf_conntrack_generation);
398 		hash = hash_conntrack(net,
399 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
400 		reply_hash = hash_conntrack(net,
401 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
402 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
403 
404 	clean_from_lists(ct);
405 	nf_conntrack_double_unlock(hash, reply_hash);
406 
407 	nf_ct_add_to_dying_list(ct);
408 
409 	NF_CT_STAT_INC(net, delete_list);
410 	local_bh_enable();
411 }
412 
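/* Unlink @ct and deliver the destroy event.  If event delivery fails, the
 * entry is parked on the dying list and false is returned, so the delayed
 * ecache work can attempt re-delivery later.
 */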
413 bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
414 {
415 	struct nf_conn_tstamp *tstamp;
416 
417 	tstamp = nf_conn_tstamp_find(ct);
418 	if (tstamp && tstamp->stop == 0)
419 		tstamp->stop = ktime_get_real_ns();
420 
421 	if (nf_ct_is_dying(ct))
422 		goto delete;
423 
424 	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
425 				    portid, report) < 0) {
426 		/* destroy event was not delivered */
427 		nf_ct_delete_from_lists(ct);
428 		nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
429 		return false;
430 	}
431 
432 	nf_conntrack_ecache_work(nf_ct_net(ct));
433 	set_bit(IPS_DYING_BIT, &ct->status);
434  delete:
435 	nf_ct_delete_from_lists(ct);
436 	nf_ct_put(ct);
437 	return true;
438 }
439 EXPORT_SYMBOL_GPL(nf_ct_delete);
440 
441 static void death_by_timeout(unsigned long ul_conntrack)
442 {
443 	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
444 }
445 
446 static inline bool
447 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
448 		const struct nf_conntrack_tuple *tuple,
449 		const struct nf_conntrack_zone *zone,
450 		const struct net *net)
451 {
452 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
453 
	/* A conntrack can be recreated with an equal tuple,
	 * so we need to check that the conntrack is confirmed.
	 */
457 	return nf_ct_tuple_equal(tuple, &h->tuple) &&
458 	       nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) &&
459 	       nf_ct_is_confirmed(ct) &&
460 	       net_eq(net, nf_ct_net(ct));
461 }
462 
463 /* must be called with rcu read lock held */
464 void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
465 {
466 	struct hlist_nulls_head *hptr;
467 	unsigned int sequence, hsz;
468 
469 	do {
470 		sequence = read_seqcount_begin(&nf_conntrack_generation);
471 		hsz = nf_conntrack_htable_size;
472 		hptr = nf_conntrack_hash;
473 	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
474 
475 	*hash = hptr;
476 	*hsize = hsz;
477 }
478 EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
479 
480 /*
 * Warning:
482  * - Caller must take a reference on returned object
483  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
484  */
485 static struct nf_conntrack_tuple_hash *
486 ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
487 		      const struct nf_conntrack_tuple *tuple, u32 hash)
488 {
489 	struct nf_conntrack_tuple_hash *h;
490 	struct hlist_nulls_head *ct_hash;
491 	struct hlist_nulls_node *n;
492 	unsigned int bucket, sequence;
493 
494 begin:
495 	do {
496 		sequence = read_seqcount_begin(&nf_conntrack_generation);
497 		bucket = scale_hash(hash);
498 		ct_hash = nf_conntrack_hash;
499 	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
500 
501 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
502 		if (nf_ct_key_equal(h, tuple, zone, net)) {
503 			NF_CT_STAT_INC_ATOMIC(net, found);
504 			return h;
505 		}
506 		NF_CT_STAT_INC_ATOMIC(net, searched);
507 	}
508 	/*
509 	 * if the nulls value we got at the end of this lookup is
510 	 * not the expected one, we must restart lookup.
511 	 * We probably met an item that was moved to another chain.
512 	 */
513 	if (get_nulls_value(n) != bucket) {
514 		NF_CT_STAT_INC_ATOMIC(net, search_restart);
515 		goto begin;
516 	}
517 
518 	return NULL;
519 }
520 
521 /* Find a connection corresponding to a tuple. */
522 static struct nf_conntrack_tuple_hash *
523 __nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
524 			const struct nf_conntrack_tuple *tuple, u32 hash)
525 {
526 	struct nf_conntrack_tuple_hash *h;
527 	struct nf_conn *ct;
528 
529 	rcu_read_lock();
530 begin:
531 	h = ____nf_conntrack_find(net, zone, tuple, hash);
532 	if (h) {
533 		ct = nf_ct_tuplehash_to_ctrack(h);
534 		if (unlikely(nf_ct_is_dying(ct) ||
535 			     !atomic_inc_not_zero(&ct->ct_general.use)))
536 			h = NULL;
537 		else {
538 			if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) {
539 				nf_ct_put(ct);
540 				goto begin;
541 			}
542 		}
543 	}
544 	rcu_read_unlock();
545 
546 	return h;
547 }
548 
549 struct nf_conntrack_tuple_hash *
550 nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
551 		      const struct nf_conntrack_tuple *tuple)
552 {
553 	return __nf_conntrack_find_get(net, zone, tuple,
554 				       hash_conntrack_raw(tuple, net));
555 }
556 EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
557 
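/* Link both tuple directions of @ct into the global hash table.  Callers
 * hold the two bucket locks (see nf_conntrack_double_lock()) and are
 * responsible for the reference count and the CONFIRMED bit.
 */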
558 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
559 				       unsigned int hash,
560 				       unsigned int reply_hash)
561 {
562 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
563 			   &nf_conntrack_hash[hash]);
564 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
565 			   &nf_conntrack_hash[reply_hash]);
566 }
567 
568 int
569 nf_conntrack_hash_check_insert(struct nf_conn *ct)
570 {
571 	const struct nf_conntrack_zone *zone;
572 	struct net *net = nf_ct_net(ct);
573 	unsigned int hash, reply_hash;
574 	struct nf_conntrack_tuple_hash *h;
575 	struct hlist_nulls_node *n;
576 	unsigned int sequence;
577 
578 	zone = nf_ct_zone(ct);
579 
580 	local_bh_disable();
581 	do {
582 		sequence = read_seqcount_begin(&nf_conntrack_generation);
583 		hash = hash_conntrack(net,
584 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
585 		reply_hash = hash_conntrack(net,
586 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
587 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
588 
589 	/* See if there's one in the list already, including reverse */
590 	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
591 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
592 				    zone, net))
593 			goto out;
594 
595 	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
596 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
597 				    zone, net))
598 			goto out;
599 
600 	add_timer(&ct->timeout);
601 	smp_wmb();
602 	/* The caller holds a reference to this object */
603 	atomic_set(&ct->ct_general.use, 2);
604 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
605 	nf_conntrack_double_unlock(hash, reply_hash);
606 	NF_CT_STAT_INC(net, insert);
607 	local_bh_enable();
608 	return 0;
609 
610 out:
611 	nf_conntrack_double_unlock(hash, reply_hash);
612 	NF_CT_STAT_INC(net, insert_failed);
613 	local_bh_enable();
614 	return -EEXIST;
615 }
616 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
617 
618 static inline void nf_ct_acct_update(struct nf_conn *ct,
619 				     enum ip_conntrack_info ctinfo,
620 				     unsigned int len)
621 {
622 	struct nf_conn_acct *acct;
623 
624 	acct = nf_conn_acct_find(ct);
625 	if (acct) {
626 		struct nf_conn_counter *counter = acct->counter;
627 
628 		atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
629 		atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
630 	}
631 }
632 
633 static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
634 			     const struct nf_conn *loser_ct)
635 {
636 	struct nf_conn_acct *acct;
637 
638 	acct = nf_conn_acct_find(loser_ct);
639 	if (acct) {
640 		struct nf_conn_counter *counter = acct->counter;
641 		unsigned int bytes;
642 
643 		/* u32 should be fine since we must have seen one packet. */
644 		bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
645 		nf_ct_acct_update(ct, ctinfo, bytes);
646 	}
647 }
648 
649 /* Resolve race on insertion if this protocol allows this. */
650 static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
651 			       enum ip_conntrack_info ctinfo,
652 			       struct nf_conntrack_tuple_hash *h)
653 {
654 	/* This is the conntrack entry already in hashes that won race. */
655 	struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
656 	struct nf_conntrack_l4proto *l4proto;
657 
658 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
659 	if (l4proto->allow_clash &&
660 	    !nfct_nat(ct) &&
661 	    !nf_ct_is_dying(ct) &&
662 	    atomic_inc_not_zero(&ct->ct_general.use)) {
663 		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
664 		nf_conntrack_put(skb->nfct);
665 		/* Assign conntrack already in hashes to this skbuff. Don't
666 		 * modify skb->nfctinfo to ensure consistent stateful filtering.
667 		 */
668 		skb->nfct = &ct->ct_general;
669 		return NF_ACCEPT;
670 	}
671 	NF_CT_STAT_INC(net, drop);
672 	return NF_DROP;
673 }
674 
675 /* Confirm a connection given skb; places it in hash table */
676 int
677 __nf_conntrack_confirm(struct sk_buff *skb)
678 {
679 	const struct nf_conntrack_zone *zone;
680 	unsigned int hash, reply_hash;
681 	struct nf_conntrack_tuple_hash *h;
682 	struct nf_conn *ct;
683 	struct nf_conn_help *help;
684 	struct nf_conn_tstamp *tstamp;
685 	struct hlist_nulls_node *n;
686 	enum ip_conntrack_info ctinfo;
687 	struct net *net;
688 	unsigned int sequence;
689 	int ret = NF_DROP;
690 
691 	ct = nf_ct_get(skb, &ctinfo);
692 	net = nf_ct_net(ct);
693 
	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in the other direction.  The actual
	   packet which created the connection will be IP_CT_NEW or,
	   for an expected connection, IP_CT_RELATED. */
698 	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
699 		return NF_ACCEPT;
700 
701 	zone = nf_ct_zone(ct);
702 	local_bh_disable();
703 
704 	do {
705 		sequence = read_seqcount_begin(&nf_conntrack_generation);
706 		/* reuse the hash saved before */
707 		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
708 		hash = scale_hash(hash);
709 		reply_hash = hash_conntrack(net,
710 					   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
711 
712 	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
713 
714 	/* We're not in hash table, and we refuse to set up related
715 	 * connections for unconfirmed conns.  But packet copies and
716 	 * REJECT will give spurious warnings here.
717 	 */
718 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
719 
720 	/* No external references means no one else could have
721 	 * confirmed us.
722 	 */
723 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
724 	pr_debug("Confirming conntrack %p\n", ct);
725 	/* We have to check the DYING flag after unlink to prevent
726 	 * a race against nf_ct_get_next_corpse() possibly called from
727 	 * user context, else we insert an already 'dead' hash, blocking
728 	 * further use of that particular connection -JM.
729 	 */
730 	nf_ct_del_from_dying_or_unconfirmed_list(ct);
731 
732 	if (unlikely(nf_ct_is_dying(ct))) {
733 		nf_ct_add_to_dying_list(ct);
734 		goto dying;
735 	}
736 
737 	/* See if there's one in the list already, including reverse:
738 	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost the race. */
740 	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
741 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
742 				    zone, net))
743 			goto out;
744 
745 	hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
746 		if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
747 				    zone, net))
748 			goto out;
749 
750 	/* Timer relative to confirmation time, not original
751 	   setting time, otherwise we'd get timer wrap in
752 	   weird delay cases. */
753 	ct->timeout.expires += jiffies;
754 	add_timer(&ct->timeout);
755 	atomic_inc(&ct->ct_general.use);
756 	ct->status |= IPS_CONFIRMED;
757 
758 	/* set conntrack timestamp, if enabled. */
759 	tstamp = nf_conn_tstamp_find(ct);
760 	if (tstamp) {
761 		if (skb->tstamp.tv64 == 0)
762 			__net_timestamp(skb);
763 
764 		tstamp->start = ktime_to_ns(skb->tstamp);
765 	}
766 	/* Since the lookup is lockless, hash insertion must be done after
767 	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
768 	 * guarantee that no other CPU can find the conntrack before the above
769 	 * stores are visible.
770 	 */
771 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
772 	nf_conntrack_double_unlock(hash, reply_hash);
773 	NF_CT_STAT_INC(net, insert);
774 	local_bh_enable();
775 
776 	help = nfct_help(ct);
777 	if (help && help->helper)
778 		nf_conntrack_event_cache(IPCT_HELPER, ct);
779 
780 	nf_conntrack_event_cache(master_ct(ct) ?
781 				 IPCT_RELATED : IPCT_NEW, ct);
782 	return NF_ACCEPT;
783 
784 out:
785 	nf_ct_add_to_dying_list(ct);
786 	ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
787 dying:
788 	nf_conntrack_double_unlock(hash, reply_hash);
789 	NF_CT_STAT_INC(net, insert_failed);
790 	local_bh_enable();
791 	return ret;
792 }
793 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
794 
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
797 int
798 nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
799 			 const struct nf_conn *ignored_conntrack)
800 {
801 	struct net *net = nf_ct_net(ignored_conntrack);
802 	const struct nf_conntrack_zone *zone;
803 	struct nf_conntrack_tuple_hash *h;
804 	struct hlist_nulls_head *ct_hash;
805 	unsigned int hash, sequence;
806 	struct hlist_nulls_node *n;
807 	struct nf_conn *ct;
808 
809 	zone = nf_ct_zone(ignored_conntrack);
810 
811 	rcu_read_lock();
812 	do {
813 		sequence = read_seqcount_begin(&nf_conntrack_generation);
814 		hash = hash_conntrack(net, tuple);
815 		ct_hash = nf_conntrack_hash;
816 	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
817 
818 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
819 		ct = nf_ct_tuplehash_to_ctrack(h);
820 		if (ct != ignored_conntrack &&
821 		    nf_ct_key_equal(h, tuple, zone, net)) {
822 			NF_CT_STAT_INC_ATOMIC(net, found);
823 			rcu_read_unlock();
824 			return 1;
825 		}
826 		NF_CT_STAT_INC_ATOMIC(net, searched);
827 	}
828 	rcu_read_unlock();
829 
830 	return 0;
831 }
832 EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
833 
834 #define NF_CT_EVICTION_RANGE	8
835 
836 /* There's a small race here where we may free a just-assured
837    connection.  Too bad: we're in trouble anyway. */
838 static unsigned int early_drop_list(struct net *net,
839 				    struct hlist_nulls_head *head)
840 {
841 	struct nf_conntrack_tuple_hash *h;
842 	struct hlist_nulls_node *n;
843 	unsigned int drops = 0;
844 	struct nf_conn *tmp;
845 
846 	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
847 		tmp = nf_ct_tuplehash_to_ctrack(h);
848 
849 		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
850 		    !net_eq(nf_ct_net(tmp), net) ||
851 		    nf_ct_is_dying(tmp))
852 			continue;
853 
854 		if (!atomic_inc_not_zero(&tmp->ct_general.use))
855 			continue;
856 
857 		/* kill only if still in same netns -- might have moved due to
858 		 * SLAB_DESTROY_BY_RCU rules.
859 		 *
860 		 * We steal the timer reference.  If that fails timer has
861 		 * already fired or someone else deleted it. Just drop ref
862 		 * and move to next entry.
863 		 */
864 		if (net_eq(nf_ct_net(tmp), net) &&
865 		    nf_ct_is_confirmed(tmp) &&
866 		    del_timer(&tmp->timeout) &&
867 		    nf_ct_delete(tmp, 0, 0))
868 			drops++;
869 
870 		nf_ct_put(tmp);
871 	}
872 
873 	return drops;
874 }
875 
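/* Table is full: scan up to NF_CT_EVICTION_RANGE buckets, starting at the
 * bucket the new connection hashes to, and evict unassured entries to
 * make room.  Returns true if at least one entry was dropped.
 */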
876 static noinline int early_drop(struct net *net, unsigned int _hash)
877 {
878 	unsigned int i;
879 
880 	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
881 		struct hlist_nulls_head *ct_hash;
		unsigned int hash, sequence, drops;
883 
884 		rcu_read_lock();
885 		do {
886 			sequence = read_seqcount_begin(&nf_conntrack_generation);
887 			hash = scale_hash(_hash++);
888 			ct_hash = nf_conntrack_hash;
889 		} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
890 
891 		drops = early_drop_list(net, &ct_hash[hash]);
892 		rcu_read_unlock();
893 
894 		if (drops) {
895 			NF_CT_STAT_ADD_ATOMIC(net, early_drop, drops);
896 			return true;
897 		}
898 	}
899 
900 	return false;
901 }
902 
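/* Allocate a conntrack entry from the SLAB_DESTROY_BY_RCU cache.  The
 * nf_conntrack_max limit is enforced here, falling back to early_drop()
 * before giving up with -ENOMEM.  The reference count is left at zero;
 * it is only set once the entry is placed on a list.
 */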
903 static struct nf_conn *
904 __nf_conntrack_alloc(struct net *net,
905 		     const struct nf_conntrack_zone *zone,
906 		     const struct nf_conntrack_tuple *orig,
907 		     const struct nf_conntrack_tuple *repl,
908 		     gfp_t gfp, u32 hash)
909 {
910 	struct nf_conn *ct;
911 
912 	/* We don't want any race condition at early drop stage */
913 	atomic_inc(&net->ct.count);
914 
915 	if (nf_conntrack_max &&
916 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
917 		if (!early_drop(net, hash)) {
918 			atomic_dec(&net->ct.count);
919 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
920 			return ERR_PTR(-ENOMEM);
921 		}
922 	}
923 
924 	/*
925 	 * Do not use kmem_cache_zalloc(), as this cache uses
926 	 * SLAB_DESTROY_BY_RCU.
927 	 */
928 	ct = kmem_cache_alloc(nf_conntrack_cachep, gfp);
929 	if (ct == NULL)
930 		goto out;
931 
932 	spin_lock_init(&ct->lock);
933 	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
934 	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
935 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
936 	/* save hash for reusing when confirming */
937 	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
938 	ct->status = 0;
939 	/* Don't set timer yet: wait for confirmation */
940 	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
941 	write_pnet(&ct->ct_net, net);
942 	memset(&ct->__nfct_init_offset[0], 0,
943 	       offsetof(struct nf_conn, proto) -
944 	       offsetof(struct nf_conn, __nfct_init_offset[0]));
945 
946 	nf_ct_zone_add(ct, zone);
947 
948 	/* Because we use RCU lookups, we set ct_general.use to zero before
949 	 * this is inserted in any list.
950 	 */
951 	atomic_set(&ct->ct_general.use, 0);
952 	return ct;
953 out:
954 	atomic_dec(&net->ct.count);
955 	return ERR_PTR(-ENOMEM);
956 }
957 
958 struct nf_conn *nf_conntrack_alloc(struct net *net,
959 				   const struct nf_conntrack_zone *zone,
960 				   const struct nf_conntrack_tuple *orig,
961 				   const struct nf_conntrack_tuple *repl,
962 				   gfp_t gfp)
963 {
964 	return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0);
965 }
966 EXPORT_SYMBOL_GPL(nf_conntrack_alloc);
967 
968 void nf_conntrack_free(struct nf_conn *ct)
969 {
970 	struct net *net = nf_ct_net(ct);
971 
972 	/* A freed object has refcnt == 0, that's
973 	 * the golden rule for SLAB_DESTROY_BY_RCU
974 	 */
975 	NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0);
976 
977 	nf_ct_ext_destroy(ct);
978 	nf_ct_ext_free(ct);
979 	kmem_cache_free(nf_conntrack_cachep, ct);
980 	smp_mb__before_atomic();
981 	atomic_dec(&net->ct.count);
982 }
983 EXPORT_SYMBOL_GPL(nf_conntrack_free);
984 
985 
986 /* Allocate a new conntrack: we return -ENOMEM if classification
987    failed due to stress.  Otherwise it really is unclassifiable. */
988 static struct nf_conntrack_tuple_hash *
989 init_conntrack(struct net *net, struct nf_conn *tmpl,
990 	       const struct nf_conntrack_tuple *tuple,
991 	       struct nf_conntrack_l3proto *l3proto,
992 	       struct nf_conntrack_l4proto *l4proto,
993 	       struct sk_buff *skb,
994 	       unsigned int dataoff, u32 hash)
995 {
996 	struct nf_conn *ct;
997 	struct nf_conn_help *help;
998 	struct nf_conntrack_tuple repl_tuple;
999 	struct nf_conntrack_ecache *ecache;
1000 	struct nf_conntrack_expect *exp = NULL;
1001 	const struct nf_conntrack_zone *zone;
1002 	struct nf_conn_timeout *timeout_ext;
1003 	struct nf_conntrack_zone tmp;
1004 	unsigned int *timeouts;
1005 
1006 	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
1007 		pr_debug("Can't invert tuple.\n");
1008 		return NULL;
1009 	}
1010 
1011 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1012 	ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
1013 				  hash);
1014 	if (IS_ERR(ct))
1015 		return (struct nf_conntrack_tuple_hash *)ct;
1016 
1017 	if (tmpl && nfct_synproxy(tmpl)) {
1018 		nfct_seqadj_ext_add(ct);
1019 		nfct_synproxy_ext_add(ct);
1020 	}
1021 
1022 	timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
1023 	if (timeout_ext) {
1024 		timeouts = nf_ct_timeout_data(timeout_ext);
1025 		if (unlikely(!timeouts))
1026 			timeouts = l4proto->get_timeouts(net);
1027 	} else {
1028 		timeouts = l4proto->get_timeouts(net);
1029 	}
1030 
1031 	if (!l4proto->new(ct, skb, dataoff, timeouts)) {
1032 		nf_conntrack_free(ct);
1033 		pr_debug("can't track with proto module\n");
1034 		return NULL;
1035 	}
1036 
1037 	if (timeout_ext)
1038 		nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
1039 				      GFP_ATOMIC);
1040 
1041 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1042 	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1043 	nf_ct_labels_ext_add(ct);
1044 
1045 	ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
1046 	nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
1047 				 ecache ? ecache->expmask : 0,
1048 			     GFP_ATOMIC);
1049 
1050 	local_bh_disable();
1051 	if (net->ct.expect_count) {
1052 		spin_lock(&nf_conntrack_expect_lock);
1053 		exp = nf_ct_find_expectation(net, zone, tuple);
1054 		if (exp) {
1055 			pr_debug("expectation arrives ct=%p exp=%p\n",
1056 				 ct, exp);
1057 			/* Welcome, Mr. Bond.  We've been expecting you... */
1058 			__set_bit(IPS_EXPECTED_BIT, &ct->status);
1059 			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
1060 			ct->master = exp->master;
1061 			if (exp->helper) {
1062 				help = nf_ct_helper_ext_add(ct, exp->helper,
1063 							    GFP_ATOMIC);
1064 				if (help)
1065 					rcu_assign_pointer(help->helper, exp->helper);
1066 			}
1067 
1068 #ifdef CONFIG_NF_CONNTRACK_MARK
1069 			ct->mark = exp->master->mark;
1070 #endif
1071 #ifdef CONFIG_NF_CONNTRACK_SECMARK
1072 			ct->secmark = exp->master->secmark;
1073 #endif
1074 			NF_CT_STAT_INC(net, expect_new);
1075 		}
1076 		spin_unlock(&nf_conntrack_expect_lock);
1077 	}
1078 	if (!exp) {
1079 		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
1080 		NF_CT_STAT_INC(net, new);
1081 	}
1082 
1083 	/* Now it is inserted into the unconfirmed list, bump refcount */
1084 	nf_conntrack_get(&ct->ct_general);
1085 	nf_ct_add_to_unconfirmed_list(ct);
1086 
1087 	local_bh_enable();
1088 
1089 	if (exp) {
1090 		if (exp->expectfn)
1091 			exp->expectfn(ct, exp);
1092 		nf_ct_expect_put(exp);
1093 	}
1094 
1095 	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
1096 }
1097 
1098 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
1099 static inline struct nf_conn *
1100 resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
1101 		  struct sk_buff *skb,
1102 		  unsigned int dataoff,
1103 		  u_int16_t l3num,
1104 		  u_int8_t protonum,
1105 		  struct nf_conntrack_l3proto *l3proto,
1106 		  struct nf_conntrack_l4proto *l4proto,
1107 		  int *set_reply,
1108 		  enum ip_conntrack_info *ctinfo)
1109 {
1110 	const struct nf_conntrack_zone *zone;
1111 	struct nf_conntrack_tuple tuple;
1112 	struct nf_conntrack_tuple_hash *h;
1113 	struct nf_conntrack_zone tmp;
1114 	struct nf_conn *ct;
1115 	u32 hash;
1116 
1117 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
1118 			     dataoff, l3num, protonum, net, &tuple, l3proto,
1119 			     l4proto)) {
1120 		pr_debug("Can't get tuple\n");
1121 		return NULL;
1122 	}
1123 
1124 	/* look for tuple match */
1125 	zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
1126 	hash = hash_conntrack_raw(&tuple, net);
1127 	h = __nf_conntrack_find_get(net, zone, &tuple, hash);
1128 	if (!h) {
1129 		h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
1130 				   skb, dataoff, hash);
1131 		if (!h)
1132 			return NULL;
1133 		if (IS_ERR(h))
1134 			return (void *)h;
1135 	}
1136 	ct = nf_ct_tuplehash_to_ctrack(h);
1137 
1138 	/* It exists; we have (non-exclusive) reference. */
1139 	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
1140 		*ctinfo = IP_CT_ESTABLISHED_REPLY;
		/* Please set the reply bit if this packet is OK */
1142 		*set_reply = 1;
1143 	} else {
1144 		/* Once we've had two way comms, always ESTABLISHED. */
1145 		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
1146 			pr_debug("normal packet for %p\n", ct);
1147 			*ctinfo = IP_CT_ESTABLISHED;
1148 		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
1149 			pr_debug("related packet for %p\n", ct);
1150 			*ctinfo = IP_CT_RELATED;
1151 		} else {
1152 			pr_debug("new packet for %p\n", ct);
1153 			*ctinfo = IP_CT_NEW;
1154 		}
1155 		*set_reply = 0;
1156 	}
1157 	skb->nfct = &ct->ct_general;
1158 	skb->nfctinfo = *ctinfo;
1159 	return ct;
1160 }
1161 
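/* Hook entry point for connection tracking: resolve (or create) the
 * conntrack entry for @skb, run the l4 protocol handler on the packet and
 * attach the result via skb->nfct/nfctinfo.  Returns a netfilter verdict.
 */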
1162 unsigned int
1163 nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
1164 		struct sk_buff *skb)
1165 {
1166 	struct nf_conn *ct, *tmpl = NULL;
1167 	enum ip_conntrack_info ctinfo;
1168 	struct nf_conntrack_l3proto *l3proto;
1169 	struct nf_conntrack_l4proto *l4proto;
1170 	unsigned int *timeouts;
1171 	unsigned int dataoff;
1172 	u_int8_t protonum;
1173 	int set_reply = 0;
1174 	int ret;
1175 
1176 	if (skb->nfct) {
1177 		/* Previously seen (loopback or untracked)?  Ignore. */
1178 		tmpl = (struct nf_conn *)skb->nfct;
1179 		if (!nf_ct_is_template(tmpl)) {
1180 			NF_CT_STAT_INC_ATOMIC(net, ignore);
1181 			return NF_ACCEPT;
1182 		}
1183 		skb->nfct = NULL;
1184 	}
1185 
1186 	/* rcu_read_lock()ed by nf_hook_slow */
1187 	l3proto = __nf_ct_l3proto_find(pf);
1188 	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
1189 				   &dataoff, &protonum);
1190 	if (ret <= 0) {
1191 		pr_debug("not prepared to track yet or error occurred\n");
1192 		NF_CT_STAT_INC_ATOMIC(net, error);
1193 		NF_CT_STAT_INC_ATOMIC(net, invalid);
1194 		ret = -ret;
1195 		goto out;
1196 	}
1197 
1198 	l4proto = __nf_ct_l4proto_find(pf, protonum);
1199 
	/* It may be a special packet, error, unclean...
	 * the inverse of the return code tells the netfilter
	 * core what to do with the packet. */
1203 	if (l4proto->error != NULL) {
1204 		ret = l4proto->error(net, tmpl, skb, dataoff, &ctinfo,
1205 				     pf, hooknum);
1206 		if (ret <= 0) {
1207 			NF_CT_STAT_INC_ATOMIC(net, error);
1208 			NF_CT_STAT_INC_ATOMIC(net, invalid);
1209 			ret = -ret;
1210 			goto out;
1211 		}
1212 		/* ICMP[v6] protocol trackers may assign one conntrack. */
1213 		if (skb->nfct)
1214 			goto out;
1215 	}
1216 
1217 	ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
1218 			       l3proto, l4proto, &set_reply, &ctinfo);
1219 	if (!ct) {
1220 		/* Not valid part of a connection */
1221 		NF_CT_STAT_INC_ATOMIC(net, invalid);
1222 		ret = NF_ACCEPT;
1223 		goto out;
1224 	}
1225 
1226 	if (IS_ERR(ct)) {
1227 		/* Too stressed to deal. */
1228 		NF_CT_STAT_INC_ATOMIC(net, drop);
1229 		ret = NF_DROP;
1230 		goto out;
1231 	}
1232 
1233 	NF_CT_ASSERT(skb->nfct);
1234 
1235 	/* Decide what timeout policy we want to apply to this flow. */
1236 	timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
1237 
1238 	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum, timeouts);
1239 	if (ret <= 0) {
1240 		/* Invalid: inverse of the return code tells
1241 		 * the netfilter core what to do */
1242 		pr_debug("nf_conntrack_in: Can't track with proto module\n");
1243 		nf_conntrack_put(skb->nfct);
1244 		skb->nfct = NULL;
1245 		NF_CT_STAT_INC_ATOMIC(net, invalid);
1246 		if (ret == -NF_DROP)
1247 			NF_CT_STAT_INC_ATOMIC(net, drop);
1248 		ret = -ret;
1249 		goto out;
1250 	}
1251 
1252 	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
1253 		nf_conntrack_event_cache(IPCT_REPLY, ct);
1254 out:
1255 	if (tmpl) {
1256 		/* Special case: we have to repeat this hook, assign the
1257 		 * template again to this packet. We assume that this packet
1258 		 * has no conntrack assigned. This is used by nf_ct_tcp. */
1259 		if (ret == NF_REPEAT)
1260 			skb->nfct = (struct nf_conntrack *)tmpl;
1261 		else
1262 			nf_ct_put(tmpl);
1263 	}
1264 
1265 	return ret;
1266 }
1267 EXPORT_SYMBOL_GPL(nf_conntrack_in);
1268 
1269 bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
1270 			  const struct nf_conntrack_tuple *orig)
1271 {
1272 	bool ret;
1273 
1274 	rcu_read_lock();
1275 	ret = nf_ct_invert_tuple(inverse, orig,
1276 				 __nf_ct_l3proto_find(orig->src.l3num),
1277 				 __nf_ct_l4proto_find(orig->src.l3num,
1278 						      orig->dst.protonum));
1279 	rcu_read_unlock();
1280 	return ret;
1281 }
1282 EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
1283 
1284 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1285    implicitly racy: see __nf_conntrack_confirm */
1286 void nf_conntrack_alter_reply(struct nf_conn *ct,
1287 			      const struct nf_conntrack_tuple *newreply)
1288 {
1289 	struct nf_conn_help *help = nfct_help(ct);
1290 
1291 	/* Should be unconfirmed, so not in hash table yet */
1292 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
1293 
1294 	pr_debug("Altering reply tuple of %p to ", ct);
1295 	nf_ct_dump_tuple(newreply);
1296 
1297 	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1298 	if (ct->master || (help && !hlist_empty(&help->expectations)))
1299 		return;
1300 
1301 	rcu_read_lock();
1302 	__nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC);
1303 	rcu_read_unlock();
1304 }
1305 EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);
1306 
1307 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1308 void __nf_ct_refresh_acct(struct nf_conn *ct,
1309 			  enum ip_conntrack_info ctinfo,
1310 			  const struct sk_buff *skb,
1311 			  unsigned long extra_jiffies,
1312 			  int do_acct)
1313 {
1314 	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
1315 	NF_CT_ASSERT(skb);
1316 
1317 	/* Only update if this is not a fixed timeout */
1318 	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
1319 		goto acct;
1320 
1321 	/* If not in hash table, timer will not be active yet */
1322 	if (!nf_ct_is_confirmed(ct)) {
1323 		ct->timeout.expires = extra_jiffies;
1324 	} else {
1325 		unsigned long newtime = jiffies + extra_jiffies;
1326 
1327 		/* Only update the timeout if the new timeout is at least
1328 		   HZ jiffies from the old timeout. Need del_timer for race
1329 		   avoidance (may already be dying). */
1330 		if (newtime - ct->timeout.expires >= HZ)
1331 			mod_timer_pending(&ct->timeout, newtime);
1332 	}
1333 
1334 acct:
1335 	if (do_acct)
1336 		nf_ct_acct_update(ct, ctinfo, skb->len);
1337 }
1338 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
1339 
1340 bool __nf_ct_kill_acct(struct nf_conn *ct,
1341 		       enum ip_conntrack_info ctinfo,
1342 		       const struct sk_buff *skb,
1343 		       int do_acct)
1344 {
1345 	if (do_acct)
1346 		nf_ct_acct_update(ct, ctinfo, skb->len);
1347 
1348 	if (del_timer(&ct->timeout)) {
1349 		ct->timeout.function((unsigned long)ct);
1350 		return true;
1351 	}
1352 	return false;
1353 }
1354 EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
1355 
1356 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
1357 
1358 #include <linux/netfilter/nfnetlink.h>
1359 #include <linux/netfilter/nfnetlink_conntrack.h>
1360 #include <linux/mutex.h>
1361 
/* Generic function for tcp/udp/sctp/dccp and the like.  This needs to be
 * in nf_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
1365 int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
1366 			       const struct nf_conntrack_tuple *tuple)
1367 {
1368 	if (nla_put_be16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port) ||
1369 	    nla_put_be16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port))
1370 		goto nla_put_failure;
1371 	return 0;
1372 
1373 nla_put_failure:
1374 	return -1;
1375 }
1376 EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);
1377 
1378 const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
1379 	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
1380 	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
1381 };
1382 EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);
1383 
1384 int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
1385 			       struct nf_conntrack_tuple *t)
1386 {
1387 	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
1388 		return -EINVAL;
1389 
1390 	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
1391 	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);
1392 
1393 	return 0;
1394 }
1395 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);
1396 
1397 int nf_ct_port_nlattr_tuple_size(void)
1398 {
1399 	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1400 }
1401 EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
1402 #endif
1403 
1404 /* Used by ipt_REJECT and ip6t_REJECT. */
1405 static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
1406 {
1407 	struct nf_conn *ct;
1408 	enum ip_conntrack_info ctinfo;
1409 
1410 	/* This ICMP is in reverse direction to the packet which caused it */
1411 	ct = nf_ct_get(skb, &ctinfo);
1412 	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1413 		ctinfo = IP_CT_RELATED_REPLY;
1414 	else
1415 		ctinfo = IP_CT_RELATED;
1416 
1417 	/* Attach to new skbuff, and increment count */
1418 	nskb->nfct = &ct->ct_general;
1419 	nskb->nfctinfo = ctinfo;
1420 	nf_conntrack_get(nskb->nfct);
1421 }
1422 
1423 /* Bring out ya dead! */
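/* Walk the hash table and return the next confirmed entry for which
 * iter() is true, with a reference held.  Entries on the per-cpu
 * unconfirmed lists are not returned; matching ones are only flagged
 * IPS_DYING.
 */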
1424 static struct nf_conn *
1425 get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
1426 		void *data, unsigned int *bucket)
1427 {
1428 	struct nf_conntrack_tuple_hash *h;
1429 	struct nf_conn *ct;
1430 	struct hlist_nulls_node *n;
1431 	int cpu;
1432 	spinlock_t *lockp;
1433 
1434 	for (; *bucket < nf_conntrack_htable_size; (*bucket)++) {
1435 		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
1436 		local_bh_disable();
1437 		nf_conntrack_lock(lockp);
1438 		if (*bucket < nf_conntrack_htable_size) {
1439 			hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) {
1440 				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
1441 					continue;
1442 				ct = nf_ct_tuplehash_to_ctrack(h);
1443 				if (net_eq(nf_ct_net(ct), net) &&
1444 				    iter(ct, data))
1445 					goto found;
1446 			}
1447 		}
1448 		spin_unlock(lockp);
1449 		local_bh_enable();
1450 		cond_resched();
1451 	}
1452 
1453 	for_each_possible_cpu(cpu) {
1454 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1455 
1456 		spin_lock_bh(&pcpu->lock);
1457 		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
1458 			ct = nf_ct_tuplehash_to_ctrack(h);
1459 			if (iter(ct, data))
1460 				set_bit(IPS_DYING_BIT, &ct->status);
1461 		}
1462 		spin_unlock_bh(&pcpu->lock);
1463 		cond_resched();
1464 	}
1465 	return NULL;
1466 found:
1467 	atomic_inc(&ct->ct_general.use);
1468 	spin_unlock(lockp);
1469 	local_bh_enable();
1470 	return ct;
1471 }
1472 
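/* Delete every conntrack entry in @net for which iter() returns true.
 * An entry is deleted here only if its timer can be stopped first;
 * otherwise the pending timer already owns that reference and will
 * delete the entry itself.
 */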
1473 void nf_ct_iterate_cleanup(struct net *net,
1474 			   int (*iter)(struct nf_conn *i, void *data),
1475 			   void *data, u32 portid, int report)
1476 {
1477 	struct nf_conn *ct;
1478 	unsigned int bucket = 0;
1479 
1480 	might_sleep();
1481 
1482 	if (atomic_read(&net->ct.count) == 0)
1483 		return;
1484 
1485 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
1487 		if (del_timer(&ct->timeout))
1488 			nf_ct_delete(ct, portid, report);
1489 
1490 		/* ... else the timer will get him soon. */
1491 
1492 		nf_ct_put(ct);
1493 		cond_resched();
1494 	}
1495 }
1496 EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
1497 
1498 static int kill_all(struct nf_conn *i, void *data)
1499 {
1500 	return 1;
1501 }
1502 
1503 void nf_ct_free_hashtable(void *hash, unsigned int size)
1504 {
1505 	if (is_vmalloc_addr(hash))
1506 		vfree(hash);
1507 	else
1508 		free_pages((unsigned long)hash,
1509 			   get_order(sizeof(struct hlist_head) * size));
1510 }
1511 EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
1512 
1513 static int untrack_refs(void)
1514 {
1515 	int cnt = 0, cpu;
1516 
1517 	for_each_possible_cpu(cpu) {
1518 		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1519 
1520 		cnt += atomic_read(&ct->ct_general.use) - 1;
1521 	}
1522 	return cnt;
1523 }
1524 
1525 void nf_conntrack_cleanup_start(void)
1526 {
1527 	RCU_INIT_POINTER(ip_ct_attach, NULL);
1528 }
1529 
1530 void nf_conntrack_cleanup_end(void)
1531 {
1532 	RCU_INIT_POINTER(nf_ct_destroy, NULL);
1533 	while (untrack_refs() > 0)
1534 		schedule();
1535 
1536 	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1537 
1538 	nf_conntrack_proto_fini();
1539 	nf_conntrack_seqadj_fini();
1540 	nf_conntrack_labels_fini();
1541 	nf_conntrack_helper_fini();
1542 	nf_conntrack_timeout_fini();
1543 	nf_conntrack_ecache_fini();
1544 	nf_conntrack_tstamp_fini();
1545 	nf_conntrack_acct_fini();
1546 	nf_conntrack_expect_fini();
1547 
1548 	kmem_cache_destroy(nf_conntrack_cachep);
1549 }
1550 
1551 /*
1552  * Mishearing the voices in his head, our hero wonders how he's
1553  * supposed to kill the mall.
1554  */
1555 void nf_conntrack_cleanup_net(struct net *net)
1556 {
1557 	LIST_HEAD(single);
1558 
1559 	list_add(&net->exit_list, &single);
1560 	nf_conntrack_cleanup_net_list(&single);
1561 }
1562 
1563 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
1564 {
1565 	int busy;
1566 	struct net *net;
1567 
	/*
	 * This makes sure all current packets have passed through
	 * the netfilter framework.  Roll on, two-stage module
	 * delete...
	 */
1573 	synchronize_net();
1574 i_see_dead_people:
1575 	busy = 0;
1576 	list_for_each_entry(net, net_exit_list, exit_list) {
1577 		nf_ct_iterate_cleanup(net, kill_all, NULL, 0, 0);
1578 		if (atomic_read(&net->ct.count) != 0)
1579 			busy = 1;
1580 	}
1581 	if (busy) {
1582 		schedule();
1583 		goto i_see_dead_people;
1584 	}
1585 
1586 	list_for_each_entry(net, net_exit_list, exit_list) {
1587 		nf_conntrack_proto_pernet_fini(net);
1588 		nf_conntrack_helper_pernet_fini(net);
1589 		nf_conntrack_ecache_pernet_fini(net);
1590 		nf_conntrack_tstamp_pernet_fini(net);
1591 		nf_conntrack_acct_pernet_fini(net);
1592 		nf_conntrack_expect_pernet_fini(net);
1593 		free_percpu(net->ct.stat);
1594 		free_percpu(net->ct.pcpu_lists);
1595 	}
1596 }
1597 
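/* Allocate a hash bucket array.  The requested size is rounded up to a
 * whole number of pages; allocation tries __get_free_pages() first and
 * falls back to vzalloc().  When @nulls is set, each bucket's nulls
 * marker is initialised with its bucket index.
 */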
1598 void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1599 {
1600 	struct hlist_nulls_head *hash;
1601 	unsigned int nr_slots, i;
1602 	size_t sz;
1603 
1604 	if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1605 		return NULL;
1606 
1607 	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1608 	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1609 
1610 	if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
1611 		return NULL;
1612 
1613 	sz = nr_slots * sizeof(struct hlist_nulls_head);
1614 	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1615 					get_order(sz));
1616 	if (!hash)
1617 		hash = vzalloc(sz);
1618 
1619 	if (hash && nulls)
1620 		for (i = 0; i < nr_slots; i++)
1621 			INIT_HLIST_NULLS_HEAD(&hash[i], i);
1622 
1623 	return hash;
1624 }
1625 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1626 
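/* Replace the global hash table with one of @hashsize buckets.  All
 * existing entries are rehashed into the new table while every bucket
 * lock is held via nf_conntrack_all_lock(); nf_conntrack_generation is
 * bumped so that lockless lookups notice the switch and retry.
 */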
1627 int nf_conntrack_hash_resize(unsigned int hashsize)
1628 {
1629 	int i, bucket;
1630 	unsigned int old_size;
1631 	struct hlist_nulls_head *hash, *old_hash;
1632 	struct nf_conntrack_tuple_hash *h;
1633 	struct nf_conn *ct;
1634 
1635 	if (!hashsize)
1636 		return -EINVAL;
1637 
1638 	hash = nf_ct_alloc_hashtable(&hashsize, 1);
1639 	if (!hash)
1640 		return -ENOMEM;
1641 
1642 	old_size = nf_conntrack_htable_size;
1643 	if (old_size == hashsize) {
1644 		nf_ct_free_hashtable(hash, hashsize);
1645 		return 0;
1646 	}
1647 
1648 	local_bh_disable();
1649 	nf_conntrack_all_lock();
1650 	write_seqcount_begin(&nf_conntrack_generation);
1651 
1652 	/* Lookups in the old hash might happen in parallel, which means we
1653 	 * might get false negatives during connection lookup. New connections
1654 	 * created because of a false negative won't make it into the hash
1655 	 * though since that required taking the locks.
1656 	 */
1657 
1658 	for (i = 0; i < nf_conntrack_htable_size; i++) {
1659 		while (!hlist_nulls_empty(&nf_conntrack_hash[i])) {
1660 			h = hlist_nulls_entry(nf_conntrack_hash[i].first,
1661 					      struct nf_conntrack_tuple_hash, hnnode);
1662 			ct = nf_ct_tuplehash_to_ctrack(h);
1663 			hlist_nulls_del_rcu(&h->hnnode);
1664 			bucket = __hash_conntrack(nf_ct_net(ct),
1665 						  &h->tuple, hashsize);
1666 			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
1667 		}
1668 	}
1669 	old_size = nf_conntrack_htable_size;
1670 	old_hash = nf_conntrack_hash;
1671 
1672 	nf_conntrack_hash = hash;
1673 	nf_conntrack_htable_size = hashsize;
1674 
1675 	write_seqcount_end(&nf_conntrack_generation);
1676 	nf_conntrack_all_unlock();
1677 	local_bh_enable();
1678 
1679 	synchronize_net();
1680 	nf_ct_free_hashtable(old_hash, old_size);
1681 	return 0;
1682 }
1683 
1684 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1685 {
1686 	unsigned int hashsize;
1687 	int rc;
1688 
1689 	if (current->nsproxy->net_ns != &init_net)
1690 		return -EOPNOTSUPP;
1691 
1692 	/* On boot, we can set this without any fancy locking. */
1693 	if (!nf_conntrack_htable_size)
1694 		return param_set_uint(val, kp);
1695 
1696 	rc = kstrtouint(val, 0, &hashsize);
1697 	if (rc)
1698 		return rc;
1699 
1700 	return nf_conntrack_hash_resize(hashsize);
1701 }
1702 EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
1703 
1704 module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
1705 		  &nf_conntrack_htable_size, 0600);
1706 
1707 void nf_ct_untracked_status_or(unsigned long bits)
1708 {
1709 	int cpu;
1710 
1711 	for_each_possible_cpu(cpu)
1712 		per_cpu(nf_conntrack_untracked, cpu).status |= bits;
1713 }
1714 EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
1715 
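/* Module init: size the hash table from available memory unless a value
 * was given via the hashsize parameter, create the nf_conn slab cache,
 * bring up the extension subsystems and set up the per-cpu untracked
 * conntrack entries.
 */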
1716 int nf_conntrack_init_start(void)
1717 {
1718 	int max_factor = 8;
1719 	int ret = -ENOMEM;
1720 	int i, cpu;
1721 
1722 	seqcount_init(&nf_conntrack_generation);
1723 
1724 	for (i = 0; i < CONNTRACK_LOCKS; i++)
1725 		spin_lock_init(&nf_conntrack_locks[i]);
1726 
1727 	if (!nf_conntrack_htable_size) {
1728 		/* Idea from tcp.c: use 1/16384 of memory.
1729 		 * On i386: 32MB machine has 512 buckets.
1730 		 * >= 1GB machines have 16384 buckets.
1731 		 * >= 4GB machines have 65536 buckets.
1732 		 */
1733 		nf_conntrack_htable_size
1734 			= (((totalram_pages << PAGE_SHIFT) / 16384)
1735 			   / sizeof(struct hlist_head));
1736 		if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
1737 			nf_conntrack_htable_size = 65536;
1738 		else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
1739 			nf_conntrack_htable_size = 16384;
1740 		if (nf_conntrack_htable_size < 32)
1741 			nf_conntrack_htable_size = 32;
1742 
1743 		/* Use a max. factor of four by default to get the same max as
1744 		 * with the old struct list_heads. When a table size is given
1745 		 * we use the old value of 8 to avoid reducing the max.
1746 		 * entries. */
1747 		max_factor = 4;
1748 	}
1749 
1750 	nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);
1751 	if (!nf_conntrack_hash)
1752 		return -ENOMEM;
1753 
1754 	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
1755 
1756 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
1757 						sizeof(struct nf_conn), 0,
1758 						SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL);
1759 	if (!nf_conntrack_cachep)
1760 		goto err_cachep;
1761 
1762 	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
1763 	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
1764 	       nf_conntrack_max);
1765 
1766 	ret = nf_conntrack_expect_init();
1767 	if (ret < 0)
1768 		goto err_expect;
1769 
1770 	ret = nf_conntrack_acct_init();
1771 	if (ret < 0)
1772 		goto err_acct;
1773 
1774 	ret = nf_conntrack_tstamp_init();
1775 	if (ret < 0)
1776 		goto err_tstamp;
1777 
1778 	ret = nf_conntrack_ecache_init();
1779 	if (ret < 0)
1780 		goto err_ecache;
1781 
1782 	ret = nf_conntrack_timeout_init();
1783 	if (ret < 0)
1784 		goto err_timeout;
1785 
1786 	ret = nf_conntrack_helper_init();
1787 	if (ret < 0)
1788 		goto err_helper;
1789 
1790 	ret = nf_conntrack_labels_init();
1791 	if (ret < 0)
1792 		goto err_labels;
1793 
1794 	ret = nf_conntrack_seqadj_init();
1795 	if (ret < 0)
1796 		goto err_seqadj;
1797 
1798 	ret = nf_conntrack_proto_init();
1799 	if (ret < 0)
1800 		goto err_proto;
1801 
1802 	/* Set up fake conntrack: to never be deleted, not in any hashes */
1803 	for_each_possible_cpu(cpu) {
1804 		struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu);
1805 		write_pnet(&ct->ct_net, &init_net);
1806 		atomic_set(&ct->ct_general.use, 1);
1807 	}
	/* - and make it look like a confirmed connection */
1809 	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
1810 	return 0;
1811 
1812 err_proto:
1813 	nf_conntrack_seqadj_fini();
1814 err_seqadj:
1815 	nf_conntrack_labels_fini();
1816 err_labels:
1817 	nf_conntrack_helper_fini();
1818 err_helper:
1819 	nf_conntrack_timeout_fini();
1820 err_timeout:
1821 	nf_conntrack_ecache_fini();
1822 err_ecache:
1823 	nf_conntrack_tstamp_fini();
1824 err_tstamp:
1825 	nf_conntrack_acct_fini();
1826 err_acct:
1827 	nf_conntrack_expect_fini();
1828 err_expect:
1829 	kmem_cache_destroy(nf_conntrack_cachep);
1830 err_cachep:
1831 	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
1832 	return ret;
1833 }
1834 
1835 void nf_conntrack_init_end(void)
1836 {
1837 	/* For use by REJECT target */
1838 	RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
1839 	RCU_INIT_POINTER(nf_ct_destroy, destroy_conntrack);
1840 }
1841 
1842 /*
1843  * We need to use special "null" values, not used in hash table
1844  */
1845 #define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
1846 #define DYING_NULLS_VAL		((1<<30)+1)
1847 #define TEMPLATE_NULLS_VAL	((1<<30)+2)
1848 
1849 int nf_conntrack_init_net(struct net *net)
1850 {
1851 	int ret = -ENOMEM;
1852 	int cpu;
1853 
1854 	atomic_set(&net->ct.count, 0);
1855 
1856 	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
1857 	if (!net->ct.pcpu_lists)
1858 		goto err_stat;
1859 
1860 	for_each_possible_cpu(cpu) {
1861 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
1862 
1863 		spin_lock_init(&pcpu->lock);
1864 		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
1865 		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
1866 	}
1867 
1868 	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
1869 	if (!net->ct.stat)
1870 		goto err_pcpu_lists;
1871 
1872 	ret = nf_conntrack_expect_pernet_init(net);
1873 	if (ret < 0)
1874 		goto err_expect;
1875 	ret = nf_conntrack_acct_pernet_init(net);
1876 	if (ret < 0)
1877 		goto err_acct;
1878 	ret = nf_conntrack_tstamp_pernet_init(net);
1879 	if (ret < 0)
1880 		goto err_tstamp;
1881 	ret = nf_conntrack_ecache_pernet_init(net);
1882 	if (ret < 0)
1883 		goto err_ecache;
1884 	ret = nf_conntrack_helper_pernet_init(net);
1885 	if (ret < 0)
1886 		goto err_helper;
1887 	ret = nf_conntrack_proto_pernet_init(net);
1888 	if (ret < 0)
1889 		goto err_proto;
1890 	return 0;
1891 
1892 err_proto:
1893 	nf_conntrack_helper_pernet_fini(net);
1894 err_helper:
1895 	nf_conntrack_ecache_pernet_fini(net);
1896 err_ecache:
1897 	nf_conntrack_tstamp_pernet_fini(net);
1898 err_tstamp:
1899 	nf_conntrack_acct_pernet_fini(net);
1900 err_acct:
1901 	nf_conntrack_expect_pernet_fini(net);
1902 err_expect:
1903 	free_percpu(net->ct.stat);
1904 err_pcpu_lists:
1905 	free_percpu(net->ct.pcpu_lists);
1906 err_stat:
1907 	return ret;
1908 }
1909