xref: /openbmc/linux/net/netfilter/nf_nat_core.c (revision ba61bb17)
1 /*
2  * (C) 1999-2001 Paul `Rusty' Russell
3  * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
4  * (C) 2011 Patrick McHardy <kaber@trash.net>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10 
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 
13 #include <linux/module.h>
14 #include <linux/types.h>
15 #include <linux/timer.h>
16 #include <linux/skbuff.h>
17 #include <linux/gfp.h>
18 #include <net/xfrm.h>
19 #include <linux/jhash.h>
20 #include <linux/rtnetlink.h>
21 
22 #include <net/netfilter/nf_conntrack.h>
23 #include <net/netfilter/nf_conntrack_core.h>
24 #include <net/netfilter/nf_nat.h>
25 #include <net/netfilter/nf_nat_l3proto.h>
26 #include <net/netfilter/nf_nat_l4proto.h>
27 #include <net/netfilter/nf_nat_core.h>
28 #include <net/netfilter/nf_nat_helper.h>
29 #include <net/netfilter/nf_conntrack_helper.h>
30 #include <net/netfilter/nf_conntrack_seqadj.h>
31 #include <net/netfilter/nf_conntrack_l3proto.h>
32 #include <net/netfilter/nf_conntrack_zones.h>
33 #include <linux/netfilter/nf_nat.h>
34 
35 #include "nf_internals.h"
36 
37 static spinlock_t nf_nat_locks[CONNTRACK_LOCKS];
38 
39 static DEFINE_MUTEX(nf_nat_proto_mutex);
40 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
41 						__read_mostly;
42 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
43 						__read_mostly;
44 static unsigned int nat_net_id __read_mostly;
45 
46 static struct hlist_head *nf_nat_bysource __read_mostly;
47 static unsigned int nf_nat_htable_size __read_mostly;
48 static unsigned int nf_nat_hash_rnd __read_mostly;
49 
50 struct nf_nat_lookup_hook_priv {
51 	struct nf_hook_entries __rcu *entries;
52 
53 	struct rcu_head rcu_head;
54 };
55 
56 struct nf_nat_hooks_net {
57 	struct nf_hook_ops *nat_hook_ops;
58 	unsigned int users;
59 };
60 
61 struct nat_net {
62 	struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO];
63 };
64 
65 inline const struct nf_nat_l3proto *
66 __nf_nat_l3proto_find(u8 family)
67 {
68 	return rcu_dereference(nf_nat_l3protos[family]);
69 }
70 
71 inline const struct nf_nat_l4proto *
72 __nf_nat_l4proto_find(u8 family, u8 protonum)
73 {
74 	return rcu_dereference(nf_nat_l4protos[family][protonum]);
75 }
76 EXPORT_SYMBOL_GPL(__nf_nat_l4proto_find);
77 
78 #ifdef CONFIG_XFRM
79 static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl)
80 {
81 	const struct nf_nat_l3proto *l3proto;
82 	const struct nf_conn *ct;
83 	enum ip_conntrack_info ctinfo;
84 	enum ip_conntrack_dir dir;
85 	unsigned long statusbit;
86 	u8 family;
87 
88 	ct = nf_ct_get(skb, &ctinfo);
89 	if (ct == NULL)
90 		return;
91 
92 	family = nf_ct_l3num(ct);
93 	l3proto = __nf_nat_l3proto_find(family);
94 	if (l3proto == NULL)
95 		return;
96 
97 	dir = CTINFO2DIR(ctinfo);
98 	if (dir == IP_CT_DIR_ORIGINAL)
99 		statusbit = IPS_DST_NAT;
100 	else
101 		statusbit = IPS_SRC_NAT;
102 
103 	l3proto->decode_session(skb, ct, dir, statusbit, fl);
104 }
105 
106 int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
107 {
108 	struct flowi fl;
109 	unsigned int hh_len;
110 	struct dst_entry *dst;
111 	int err;
112 
113 	err = xfrm_decode_session(skb, &fl, family);
114 	if (err < 0)
115 		return err;
116 
117 	dst = skb_dst(skb);
118 	if (dst->xfrm)
119 		dst = ((struct xfrm_dst *)dst)->route;
120 	dst_hold(dst);
121 
122 	dst = xfrm_lookup(net, dst, &fl, skb->sk, 0);
123 	if (IS_ERR(dst))
124 		return PTR_ERR(dst);
125 
126 	skb_dst_drop(skb);
127 	skb_dst_set(skb, dst);
128 
129 	/* Change in oif may mean change in hh_len. */
130 	hh_len = skb_dst(skb)->dev->hard_header_len;
131 	if (skb_headroom(skb) < hh_len &&
132 	    pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
133 		return -ENOMEM;
134 	return 0;
135 }
136 EXPORT_SYMBOL(nf_xfrm_me_harder);
137 #endif /* CONFIG_XFRM */
138 
139 /* We keep an extra hash for each conntrack, for fast searching. */
140 static unsigned int
141 hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
142 {
143 	unsigned int hash;
144 
145 	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
146 
147 	/* Original src, to ensure we map it consistently if possible. */
148 	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
149 		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
150 
151 	return reciprocal_scale(hash, nf_nat_htable_size);
152 }
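
/* Illustrative sketch (not part of the original file): reciprocal_scale()
 * maps the 32-bit jhash value onto [0, nf_nat_htable_size) without a
 * modulo, roughly
 *
 *	bucket = ((u64)hash * nf_nat_htable_size) >> 32;
 *
 * so conntracks sharing the same original source address/port and the
 * same L4 protocol land in the same nf_nat_bysource[] chain, which is
 * what find_appropriate_src() below relies on.
 */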
153 
154 /* Is this tuple already taken? (not by us) */
155 int
156 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
157 		  const struct nf_conn *ignored_conntrack)
158 {
159 	/* Conntrack doesn't keep track of outgoing tuples; only
160 	 * incoming ones.  NAT means they don't have a fixed mapping,
161 	 * so we invert the tuple and look for the incoming reply.
162 	 *
163 	 * We could keep a separate hash if this proves too slow.
164 	 */
165 	struct nf_conntrack_tuple reply;
166 
167 	nf_ct_invert_tuplepr(&reply, tuple);
168 	return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
169 }
170 EXPORT_SYMBOL(nf_nat_used_tuple);
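
/* Worked example (illustrative only, documentation addresses): suppose we
 * consider SNATing the UDP connection
 *
 *	ORIGINAL: 10.0.0.2:12345 -> 198.51.100.7:53
 *
 * to source 192.0.2.1:12345.  The candidate tuple is
 * 192.0.2.1:12345 -> 198.51.100.7:53; inverting it gives the reply tuple
 * 198.51.100.7:53 -> 192.0.2.1:12345.  If some other conntrack already
 * expects that reply, nf_conntrack_tuple_taken() reports the candidate as
 * used and a different port or address must be tried.
 */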
171 
172 /* If we source-map this tuple so the reply looks like reply_tuple, will
173  * that meet the constraints of the range?
174  */
175 static int in_range(const struct nf_nat_l3proto *l3proto,
176 		    const struct nf_nat_l4proto *l4proto,
177 		    const struct nf_conntrack_tuple *tuple,
178 		    const struct nf_nat_range2 *range)
179 {
180 	/* If we are supposed to map IPs, then we must be in the
181 	 * range specified, otherwise let this drag us onto a new src IP.
182 	 */
183 	if (range->flags & NF_NAT_RANGE_MAP_IPS &&
184 	    !l3proto->in_range(tuple, range))
185 		return 0;
186 
187 	if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) ||
188 	    l4proto->in_range(tuple, NF_NAT_MANIP_SRC,
189 			      &range->min_proto, &range->max_proto))
190 		return 1;
191 
192 	return 0;
193 }
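
/* A minimal sketch of a range that in_range() would check a tuple against
 * (assumed values, for illustration only):
 *
 *	struct nf_nat_range2 range = {
 *		.flags     = NF_NAT_RANGE_MAP_IPS |
 *			     NF_NAT_RANGE_PROTO_SPECIFIED,
 *		.min_addr  = { .ip = htonl(0xc0000201) },
 *		.max_addr  = { .ip = htonl(0xc000020a) },
 *		.min_proto = { .udp.port = htons(1024) },
 *		.max_proto = { .udp.port = htons(65535) },
 *	};
 *
 * Here 0xc0000201 and 0xc000020a are 192.0.2.1 and 192.0.2.10.  Without
 * NF_NAT_RANGE_MAP_IPS the address check is skipped, and without
 * NF_NAT_RANGE_PROTO_SPECIFIED any source port is acceptable.
 */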
194 
195 static inline int
196 same_src(const struct nf_conn *ct,
197 	 const struct nf_conntrack_tuple *tuple)
198 {
199 	const struct nf_conntrack_tuple *t;
200 
201 	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
202 	return (t->dst.protonum == tuple->dst.protonum &&
203 		nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) &&
204 		t->src.u.all == tuple->src.u.all);
205 }
206 
207 /* Only called for SRC manip */
208 static int
209 find_appropriate_src(struct net *net,
210 		     const struct nf_conntrack_zone *zone,
211 		     const struct nf_nat_l3proto *l3proto,
212 		     const struct nf_nat_l4proto *l4proto,
213 		     const struct nf_conntrack_tuple *tuple,
214 		     struct nf_conntrack_tuple *result,
215 		     const struct nf_nat_range2 *range)
216 {
217 	unsigned int h = hash_by_src(net, tuple);
218 	const struct nf_conn *ct;
219 
220 	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
221 		if (same_src(ct, tuple) &&
222 		    net_eq(net, nf_ct_net(ct)) &&
223 		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
224 			/* Copy source part from reply tuple. */
225 			nf_ct_invert_tuplepr(result,
226 				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
227 			result->dst = tuple->dst;
228 
229 			if (in_range(l3proto, l4proto, result, range))
230 				return 1;
231 		}
232 	}
233 	return 0;
234 }
235 
236 /* For [FUTURE] fragmentation handling, we want the least-used
237  * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
238  * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
239  * 1-65535, we don't do pro-rata allocation based on ports; we choose
240  * the ip with the lowest src-ip/dst-ip/proto usage.
241  */
242 static void
243 find_best_ips_proto(const struct nf_conntrack_zone *zone,
244 		    struct nf_conntrack_tuple *tuple,
245 		    const struct nf_nat_range2 *range,
246 		    const struct nf_conn *ct,
247 		    enum nf_nat_manip_type maniptype)
248 {
249 	union nf_inet_addr *var_ipp;
250 	unsigned int i, max;
251 	/* Host order */
252 	u32 minip, maxip, j, dist;
253 	bool full_range;
254 
255 	/* No IP mapping?  Do nothing. */
256 	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
257 		return;
258 
259 	if (maniptype == NF_NAT_MANIP_SRC)
260 		var_ipp = &tuple->src.u3;
261 	else
262 		var_ipp = &tuple->dst.u3;
263 
264 	/* Fast path: only one choice. */
265 	if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
266 		*var_ipp = range->min_addr;
267 		return;
268 	}
269 
270 	if (nf_ct_l3num(ct) == NFPROTO_IPV4)
271 		max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
272 	else
273 		max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;
274 
275 	/* Hashing source and destination IPs gives a fairly even
276 	 * spread in practice (if there are a small number of IPs
277 	 * involved, there usually aren't that many connections
278 	 * anyway).  The consistency means that servers see the same
279 	 * client coming from the same IP (some Internet Banking sites
280 	 * like this), even across reboots.
281 	 */
282 	j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
283 		   range->flags & NF_NAT_RANGE_PERSISTENT ?
284 			0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);
285 
286 	full_range = false;
287 	for (i = 0; i <= max; i++) {
288 		/* If first bytes of the address are at the maximum, use the
289 		 * distance. Otherwise use the full range.
290 		 */
291 		if (!full_range) {
292 			minip = ntohl((__force __be32)range->min_addr.all[i]);
293 			maxip = ntohl((__force __be32)range->max_addr.all[i]);
294 			dist  = maxip - minip + 1;
295 		} else {
296 			minip = 0;
297 			dist  = ~0;
298 		}
299 
300 		var_ipp->all[i] = (__force __u32)
301 			htonl(minip + reciprocal_scale(j, dist));
302 		if (var_ipp->all[i] != range->max_addr.all[i])
303 			full_range = true;
304 
305 		if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
306 			j ^= (__force u32)tuple->dst.u3.all[i];
307 	}
308 }
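
/* Worked example (assumed numbers, illustration only): for an IPv4 range
 * 192.0.2.1 - 192.0.2.14 we have max == 0 and a single pass of the loop,
 * with
 *
 *	minip  = 0xc0000201, maxip = 0xc000020e, dist = 14;
 *	offset = reciprocal_scale(j, 14);	(in [0, 13])
 *	chosen = htonl(minip + offset);
 *
 * The same client (same source address, and same destination unless
 * NF_NAT_RANGE_PERSISTENT is set) always hashes to the same j, hence
 * to the same NAT address.
 */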
309 
310 /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
311  * we change the source to map into the range. For NF_INET_PRE_ROUTING
312  * and NF_INET_LOCAL_OUT, we change the destination to map into the
313  * range. It might not be possible to get a unique tuple, but we try.
314  * At worst (or if we race), we will end up with a final duplicate in
315  * __nf_conntrack_confirm and drop the packet. */
316 static void
317 get_unique_tuple(struct nf_conntrack_tuple *tuple,
318 		 const struct nf_conntrack_tuple *orig_tuple,
319 		 const struct nf_nat_range2 *range,
320 		 struct nf_conn *ct,
321 		 enum nf_nat_manip_type maniptype)
322 {
323 	const struct nf_conntrack_zone *zone;
324 	const struct nf_nat_l3proto *l3proto;
325 	const struct nf_nat_l4proto *l4proto;
326 	struct net *net = nf_ct_net(ct);
327 
328 	zone = nf_ct_zone(ct);
329 
330 	rcu_read_lock();
331 	l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
332 	l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
333 					orig_tuple->dst.protonum);
334 
335 	/* 1) If this srcip/proto/src-proto-part is currently mapped,
336 	 * and that same mapping gives a unique tuple within the given
337 	 * range, use that.
338 	 *
339 	 * This is only required for source (i.e. NAT/masq) mappings.
340 	 * So far, we don't do local source mappings, so multiple
341 	 * manips are not an issue.
342 	 */
343 	if (maniptype == NF_NAT_MANIP_SRC &&
344 	    !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
345 		/* try the original tuple first */
346 		if (in_range(l3proto, l4proto, orig_tuple, range)) {
347 			if (!nf_nat_used_tuple(orig_tuple, ct)) {
348 				*tuple = *orig_tuple;
349 				goto out;
350 			}
351 		} else if (find_appropriate_src(net, zone, l3proto, l4proto,
352 						orig_tuple, tuple, range)) {
353 			pr_debug("get_unique_tuple: Found current src map\n");
354 			if (!nf_nat_used_tuple(tuple, ct))
355 				goto out;
356 		}
357 	}
358 
359 	/* 2) Select the least-used IP/proto combination in the given range */
360 	*tuple = *orig_tuple;
361 	find_best_ips_proto(zone, tuple, range, ct, maniptype);
362 
363 	/* 3) The per-protocol part of the manip is made to map into
364 	 * the range to make a unique tuple.
365 	 */
366 
367 	/* Only bother mapping if it's not already in range and unique */
368 	if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
369 		if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
370 			if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
371 			    l4proto->in_range(tuple, maniptype,
372 			          &range->min_proto,
373 			          &range->max_proto) &&
374 			    (range->min_proto.all == range->max_proto.all ||
375 			     !nf_nat_used_tuple(tuple, ct)))
376 				goto out;
377 		} else if (!nf_nat_used_tuple(tuple, ct)) {
378 			goto out;
379 		}
380 	}
381 
382 	/* Last chance: get protocol to try to obtain unique tuple. */
383 	l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
384 out:
385 	rcu_read_unlock();
386 }
387 
388 struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct)
389 {
390 	struct nf_conn_nat *nat = nfct_nat(ct);
391 	if (nat)
392 		return nat;
393 
394 	if (!nf_ct_is_confirmed(ct))
395 		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
396 
397 	return nat;
398 }
399 EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add);
400 
401 unsigned int
402 nf_nat_setup_info(struct nf_conn *ct,
403 		  const struct nf_nat_range2 *range,
404 		  enum nf_nat_manip_type maniptype)
405 {
406 	struct net *net = nf_ct_net(ct);
407 	struct nf_conntrack_tuple curr_tuple, new_tuple;
408 
409 	/* Can't set up NAT info for a confirmed ct. */
410 	if (nf_ct_is_confirmed(ct))
411 		return NF_ACCEPT;
412 
413 	WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
414 		maniptype != NF_NAT_MANIP_DST);
415 
416 	if (WARN_ON(nf_nat_initialized(ct, maniptype)))
417 		return NF_DROP;
418 
419 	/* What we've got will look like the inverse of the reply. Normally
420 	 * this is what is in the conntrack, except for prior
421 	 * manipulations (future optimization: if num_manips == 0,
422 	 * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
423 	 */
424 	nf_ct_invert_tuplepr(&curr_tuple,
425 			     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
426 
427 	get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
428 
429 	if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
430 		struct nf_conntrack_tuple reply;
431 
432 		/* Alter conntrack table so it will recognize replies. */
433 		nf_ct_invert_tuplepr(&reply, &new_tuple);
434 		nf_conntrack_alter_reply(ct, &reply);
435 
436 		/* Non-atomic: we own this at the moment. */
437 		if (maniptype == NF_NAT_MANIP_SRC)
438 			ct->status |= IPS_SRC_NAT;
439 		else
440 			ct->status |= IPS_DST_NAT;
441 
442 		if (nfct_help(ct) && !nfct_seqadj(ct))
443 			if (!nfct_seqadj_ext_add(ct))
444 				return NF_DROP;
445 	}
446 
447 	if (maniptype == NF_NAT_MANIP_SRC) {
448 		unsigned int srchash;
449 		spinlock_t *lock;
450 
451 		srchash = hash_by_src(net,
452 				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
453 		lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
454 		spin_lock_bh(lock);
455 		hlist_add_head_rcu(&ct->nat_bysource,
456 				   &nf_nat_bysource[srchash]);
457 		spin_unlock_bh(lock);
458 	}
459 
460 	/* It's done. */
461 	if (maniptype == NF_NAT_MANIP_DST)
462 		ct->status |= IPS_DST_NAT_DONE;
463 	else
464 		ct->status |= IPS_SRC_NAT_DONE;
465 
466 	return NF_ACCEPT;
467 }
468 EXPORT_SYMBOL(nf_nat_setup_info);
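
/* A minimal caller sketch (hypothetical, modelled on what a masquerade
 * style target does): pick the address of the outgoing interface and ask
 * for a source mapping onto it.
 *
 *	static unsigned int example_snat(struct nf_conn *ct, __be32 newsrc)
 *	{
 *		struct nf_nat_range2 range = {
 *			.flags    = NF_NAT_RANGE_MAP_IPS,
 *			.min_addr = { .ip = newsrc },
 *			.max_addr = { .ip = newsrc },
 *		};
 *
 *		return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
 *	}
 *
 * The return value is an NF_* verdict (NF_ACCEPT on success).
 */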
469 
470 static unsigned int
471 __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
472 {
473 	/* Force range to this IP; let proto decide mapping for
474 	 * per-proto parts (hence not NF_NAT_RANGE_PROTO_SPECIFIED).
475 	 * Use reply in case it's already been mangled (e.g. a local packet).
476 	 */
477 	union nf_inet_addr ip =
478 		(manip == NF_NAT_MANIP_SRC ?
479 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
480 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
481 	struct nf_nat_range2 range = {
482 		.flags		= NF_NAT_RANGE_MAP_IPS,
483 		.min_addr	= ip,
484 		.max_addr	= ip,
485 	};
486 	return nf_nat_setup_info(ct, &range, manip);
487 }
488 
489 unsigned int
490 nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
491 {
492 	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
493 }
494 EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
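
/* For reference (sketch, see HOOK2MANIP() in nf_nat.h): hook numbers map
 * to manip types roughly as
 *
 *	NF_INET_POST_ROUTING, NF_INET_LOCAL_IN  -> NF_NAT_MANIP_SRC
 *	NF_INET_PRE_ROUTING,  NF_INET_LOCAL_OUT -> NF_NAT_MANIP_DST
 *
 * A null binding therefore leaves the packet's addresses unchanged but
 * still reserves a unique tuple, so later NAT rules and conntrack
 * confirmation behave consistently.
 */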
495 
496 static unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
497 				     enum nf_nat_manip_type mtype,
498 				     enum ip_conntrack_dir dir)
499 {
500 	const struct nf_nat_l3proto *l3proto;
501 	const struct nf_nat_l4proto *l4proto;
502 	struct nf_conntrack_tuple target;
503 
504 	/* We are aiming to look like the inverse of the other direction. */
505 	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
506 
507 	l3proto = __nf_nat_l3proto_find(target.src.l3num);
508 	l4proto = __nf_nat_l4proto_find(target.src.l3num,
509 					target.dst.protonum);
510 	if (!l3proto->manip_pkt(skb, 0, l4proto, &target, mtype))
511 		return NF_DROP;
512 
513 	return NF_ACCEPT;
514 }
515 
516 /* Do packet manipulations according to nf_nat_setup_info. */
517 unsigned int nf_nat_packet(struct nf_conn *ct,
518 			   enum ip_conntrack_info ctinfo,
519 			   unsigned int hooknum,
520 			   struct sk_buff *skb)
521 {
522 	enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
523 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
524 	unsigned int verdict = NF_ACCEPT;
525 	unsigned long statusbit;
526 
527 	if (mtype == NF_NAT_MANIP_SRC)
528 		statusbit = IPS_SRC_NAT;
529 	else
530 		statusbit = IPS_DST_NAT;
531 
532 	/* Invert if this is reply dir. */
533 	if (dir == IP_CT_DIR_REPLY)
534 		statusbit ^= IPS_NAT_MASK;
535 
536 	/* Non-atomic: these bits don't change. */
537 	if (ct->status & statusbit)
538 		verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
539 
540 	return verdict;
541 }
542 EXPORT_SYMBOL_GPL(nf_nat_packet);
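
/* Worked example of the status-bit flip above (illustrative): at a
 * source-manip hook, with IPS_NAT_MASK == (IPS_SRC_NAT | IPS_DST_NAT),
 * a reply-direction packet gives
 *
 *	statusbit = IPS_SRC_NAT;
 *	statusbit ^= IPS_NAT_MASK;	(now IPS_DST_NAT)
 *
 * i.e. the reply's source is only rewritten if the connection was
 * destination-NATed in the original direction, which undoes the DNAT
 * on the way back.
 */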
543 
544 unsigned int
545 nf_nat_inet_fn(void *priv, struct sk_buff *skb,
546 	       const struct nf_hook_state *state)
547 {
548 	struct nf_conn *ct;
549 	enum ip_conntrack_info ctinfo;
550 	struct nf_conn_nat *nat;
551 	/* maniptype == SRC for postrouting. */
552 	enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
553 
554 	ct = nf_ct_get(skb, &ctinfo);
555 	/* Can't track?  It's not due to stress, or conntrack would
556 	 * have dropped it.  Hence it's the user's responsibility to
557 	 * packet filter it out, or implement conntrack/NAT for that
558 	 * protocol. 8) --RR
559 	 */
560 	if (!ct)
561 		return NF_ACCEPT;
562 
563 	nat = nfct_nat(ct);
564 
565 	switch (ctinfo) {
566 	case IP_CT_RELATED:
567 	case IP_CT_RELATED_REPLY:
568 		/* Only ICMPs can be IP_CT_IS_REPLY.  Fallthrough */
569 	case IP_CT_NEW:
570 		/* Seen it before?  This can happen for loopback, retrans,
571 		 * or local packets.
572 		 */
573 		if (!nf_nat_initialized(ct, maniptype)) {
574 			struct nf_nat_lookup_hook_priv *lpriv = priv;
575 			struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
576 			unsigned int ret;
577 			int i;
578 
579 			if (!e)
580 				goto null_bind;
581 
582 			for (i = 0; i < e->num_hook_entries; i++) {
583 				ret = e->hooks[i].hook(e->hooks[i].priv, skb,
584 						       state);
585 				if (ret != NF_ACCEPT)
586 					return ret;
587 				if (nf_nat_initialized(ct, maniptype))
588 					goto do_nat;
589 			}
590 null_bind:
591 			ret = nf_nat_alloc_null_binding(ct, state->hook);
592 			if (ret != NF_ACCEPT)
593 				return ret;
594 		} else {
595 			pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
596 				 maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
597 				 ct, ct->status);
598 			if (nf_nat_oif_changed(state->hook, ctinfo, nat,
599 					       state->out))
600 				goto oif_changed;
601 		}
602 		break;
603 	default:
604 		/* ESTABLISHED */
605 		WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
606 			ctinfo != IP_CT_ESTABLISHED_REPLY);
607 		if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
608 			goto oif_changed;
609 	}
610 do_nat:
611 	return nf_nat_packet(ct, ctinfo, state->hook, skb);
612 
613 oif_changed:
614 	nf_ct_kill_acct(ct, ctinfo, skb);
615 	return NF_DROP;
616 }
617 EXPORT_SYMBOL_GPL(nf_nat_inet_fn);
618 
619 struct nf_nat_proto_clean {
620 	u8	l3proto;
621 	u8	l4proto;
622 };
623 
624 /* kill conntracks with affected NAT section */
625 static int nf_nat_proto_remove(struct nf_conn *i, void *data)
626 {
627 	const struct nf_nat_proto_clean *clean = data;
628 
629 	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
630 	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
631 		return 0;
632 
633 	return i->status & IPS_NAT_MASK ? 1 : 0;
634 }
635 
636 static void __nf_nat_cleanup_conntrack(struct nf_conn *ct)
637 {
638 	unsigned int h;
639 
640 	h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
641 	spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
642 	hlist_del_rcu(&ct->nat_bysource);
643 	spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]);
644 }
645 
646 static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
647 {
648 	if (nf_nat_proto_remove(ct, data))
649 		return 1;
650 
651 	/* This module is being removed and this conntrack has a NAT null binding.
652 	 * Remove it from the bysource hash, as the table will be freed soon.
653 	 *
654 	 * Else, when the conntrack is destroyed, nf_nat_cleanup_conntrack()
655 	 * would delete the entry from the already-freed table.
656 	 */
657 	if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status))
658 		__nf_nat_cleanup_conntrack(ct);
659 
660 	/* don't delete conntrack.  Although that would make things a lot
661 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
662 	 */
663 	return 0;
664 }
665 
666 static void nf_nat_l4proto_clean(u8 l3proto, u8 l4proto)
667 {
668 	struct nf_nat_proto_clean clean = {
669 		.l3proto = l3proto,
670 		.l4proto = l4proto,
671 	};
672 
673 	nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
674 }
675 
676 static void nf_nat_l3proto_clean(u8 l3proto)
677 {
678 	struct nf_nat_proto_clean clean = {
679 		.l3proto = l3proto,
680 	};
681 
682 	nf_ct_iterate_destroy(nf_nat_proto_remove, &clean);
683 }
684 
685 /* Protocol registration. */
686 int nf_nat_l4proto_register(u8 l3proto, const struct nf_nat_l4proto *l4proto)
687 {
688 	const struct nf_nat_l4proto **l4protos;
689 	unsigned int i;
690 	int ret = 0;
691 
692 	mutex_lock(&nf_nat_proto_mutex);
693 	if (nf_nat_l4protos[l3proto] == NULL) {
694 		l4protos = kmalloc_array(IPPROTO_MAX,
695 					 sizeof(struct nf_nat_l4proto *),
696 					 GFP_KERNEL);
697 		if (l4protos == NULL) {
698 			ret = -ENOMEM;
699 			goto out;
700 		}
701 
702 		for (i = 0; i < IPPROTO_MAX; i++)
703 			RCU_INIT_POINTER(l4protos[i], &nf_nat_l4proto_unknown);
704 
705 		/* Before making proto_array visible to lockless readers,
706 		 * we must make sure its content is committed to memory.
707 		 */
708 		smp_wmb();
709 
710 		nf_nat_l4protos[l3proto] = l4protos;
711 	}
712 
713 	if (rcu_dereference_protected(
714 			nf_nat_l4protos[l3proto][l4proto->l4proto],
715 			lockdep_is_held(&nf_nat_proto_mutex)
716 			) != &nf_nat_l4proto_unknown) {
717 		ret = -EBUSY;
718 		goto out;
719 	}
720 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto], l4proto);
721  out:
722 	mutex_unlock(&nf_nat_proto_mutex);
723 	return ret;
724 }
725 EXPORT_SYMBOL_GPL(nf_nat_l4proto_register);
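
/* Usage sketch (modelled on the existing protocol modules, e.g.
 * nf_nat_proto_gre): an L4 NAT helper registers itself per family from
 * its module init and unregisters on exit:
 *
 *	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_gre);
 *	...
 *	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_gre);
 *
 * Until a specific helper is registered, lookups fall back to
 * nf_nat_l4proto_unknown, which accepts any tuple and leaves the L4
 * header unchanged.
 */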
726 
727 /* No one stores the protocol anywhere; simply delete it. */
728 void nf_nat_l4proto_unregister(u8 l3proto, const struct nf_nat_l4proto *l4proto)
729 {
730 	mutex_lock(&nf_nat_proto_mutex);
731 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto][l4proto->l4proto],
732 			 &nf_nat_l4proto_unknown);
733 	mutex_unlock(&nf_nat_proto_mutex);
734 	synchronize_rcu();
735 
736 	nf_nat_l4proto_clean(l3proto, l4proto->l4proto);
737 }
738 EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
739 
740 int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
741 {
742 	int err;
743 
744 	err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
745 	if (err < 0)
746 		return err;
747 
748 	mutex_lock(&nf_nat_proto_mutex);
749 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
750 			 &nf_nat_l4proto_tcp);
751 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
752 			 &nf_nat_l4proto_udp);
753 #ifdef CONFIG_NF_NAT_PROTO_DCCP
754 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
755 			 &nf_nat_l4proto_dccp);
756 #endif
757 #ifdef CONFIG_NF_NAT_PROTO_SCTP
758 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
759 			 &nf_nat_l4proto_sctp);
760 #endif
761 #ifdef CONFIG_NF_NAT_PROTO_UDPLITE
762 	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
763 			 &nf_nat_l4proto_udplite);
764 #endif
765 	mutex_unlock(&nf_nat_proto_mutex);
766 
767 	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
768 	return 0;
769 }
770 EXPORT_SYMBOL_GPL(nf_nat_l3proto_register);
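
/* Usage sketch (this mirrors what the per-family modules such as
 * nf_nat_l3proto_ipv4 do from their module init):
 *
 *	static int __init example_l3proto_init(void)
 *	{
 *		return nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
 *	}
 *
 * Registration also pre-populates the common L4 entries for that family
 * (TCP, UDP and, when configured, DCCP/SCTP/UDP-lite), as seen above.
 */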
771 
772 void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
773 {
774 	mutex_lock(&nf_nat_proto_mutex);
775 	RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], NULL);
776 	mutex_unlock(&nf_nat_proto_mutex);
777 	synchronize_rcu();
778 
779 	nf_nat_l3proto_clean(l3proto->l3proto);
780 	nf_ct_l3proto_module_put(l3proto->l3proto);
781 }
782 EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
783 
784 /* No one is using the conntrack by the time this is called. */
785 static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
786 {
787 	if (ct->status & IPS_SRC_NAT_DONE)
788 		__nf_nat_cleanup_conntrack(ct);
789 }
790 
791 static struct nf_ct_ext_type nat_extend __read_mostly = {
792 	.len		= sizeof(struct nf_conn_nat),
793 	.align		= __alignof__(struct nf_conn_nat),
794 	.destroy	= nf_nat_cleanup_conntrack,
795 	.id		= NF_CT_EXT_NAT,
796 };
797 
798 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
799 
800 #include <linux/netfilter/nfnetlink.h>
801 #include <linux/netfilter/nfnetlink_conntrack.h>
802 
803 static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = {
804 	[CTA_PROTONAT_PORT_MIN]	= { .type = NLA_U16 },
805 	[CTA_PROTONAT_PORT_MAX]	= { .type = NLA_U16 },
806 };
807 
808 static int nfnetlink_parse_nat_proto(struct nlattr *attr,
809 				     const struct nf_conn *ct,
810 				     struct nf_nat_range2 *range)
811 {
812 	struct nlattr *tb[CTA_PROTONAT_MAX+1];
813 	const struct nf_nat_l4proto *l4proto;
814 	int err;
815 
816 	err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr,
817 			       protonat_nla_policy, NULL);
818 	if (err < 0)
819 		return err;
820 
821 	l4proto = __nf_nat_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
822 	if (l4proto->nlattr_to_range)
823 		err = l4proto->nlattr_to_range(tb, range);
824 
825 	return err;
826 }
827 
828 static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = {
829 	[CTA_NAT_V4_MINIP]	= { .type = NLA_U32 },
830 	[CTA_NAT_V4_MAXIP]	= { .type = NLA_U32 },
831 	[CTA_NAT_V6_MINIP]	= { .len = sizeof(struct in6_addr) },
832 	[CTA_NAT_V6_MAXIP]	= { .len = sizeof(struct in6_addr) },
833 	[CTA_NAT_PROTO]		= { .type = NLA_NESTED },
834 };
835 
836 static int
837 nfnetlink_parse_nat(const struct nlattr *nat,
838 		    const struct nf_conn *ct, struct nf_nat_range2 *range,
839 		    const struct nf_nat_l3proto *l3proto)
840 {
841 	struct nlattr *tb[CTA_NAT_MAX+1];
842 	int err;
843 
844 	memset(range, 0, sizeof(*range));
845 
846 	err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL);
847 	if (err < 0)
848 		return err;
849 
850 	err = l3proto->nlattr_to_range(tb, range);
851 	if (err < 0)
852 		return err;
853 
854 	if (!tb[CTA_NAT_PROTO])
855 		return 0;
856 
857 	return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range);
858 }
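
/* For reference, a CTA_NAT_* payload as parsed above nests roughly like
 * this (sketch, attribute names from nfnetlink_conntrack.h):
 *
 *	CTA_NAT_SRC / CTA_NAT_DST
 *	  CTA_NAT_V4_MINIP / CTA_NAT_V6_MINIP
 *	  CTA_NAT_V4_MAXIP / CTA_NAT_V6_MAXIP
 *	  CTA_NAT_PROTO
 *	    CTA_PROTONAT_PORT_MIN
 *	    CTA_PROTONAT_PORT_MAX
 *
 * l3proto->nlattr_to_range() consumes the address attributes and
 * nfnetlink_parse_nat_proto() the nested port range.
 */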
859 
860 /* This function is called under rcu_read_lock() */
861 static int
862 nfnetlink_parse_nat_setup(struct nf_conn *ct,
863 			  enum nf_nat_manip_type manip,
864 			  const struct nlattr *attr)
865 {
866 	struct nf_nat_range2 range;
867 	const struct nf_nat_l3proto *l3proto;
868 	int err;
869 
870 	/* Should not happen, restricted to creating new conntracks
871 	 * via ctnetlink.
872 	 */
873 	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
874 		return -EEXIST;
875 
876 	/* Make sure that the L3 NAT module is there by the time we call
877 	 * nf_nat_setup_info to attach the null binding, otherwise this may oops.
878 	 */
879 	l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
880 	if (l3proto == NULL)
881 		return -EAGAIN;
882 
883 	/* No NAT information has been passed, allocate the null-binding */
884 	if (attr == NULL)
885 		return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0;
886 
887 	err = nfnetlink_parse_nat(attr, ct, &range, l3proto);
888 	if (err < 0)
889 		return err;
890 
891 	return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
892 }
893 #else
894 static int
895 nfnetlink_parse_nat_setup(struct nf_conn *ct,
896 			  enum nf_nat_manip_type manip,
897 			  const struct nlattr *attr)
898 {
899 	return -EOPNOTSUPP;
900 }
901 #endif
902 
903 static struct nf_ct_helper_expectfn follow_master_nat = {
904 	.name		= "nat-follow-master",
905 	.expectfn	= nf_nat_follow_master,
906 };
907 
908 int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops,
909 		       const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count)
910 {
911 	struct nat_net *nat_net = net_generic(net, nat_net_id);
912 	struct nf_nat_hooks_net *nat_proto_net;
913 	struct nf_nat_lookup_hook_priv *priv;
914 	unsigned int hooknum = ops->hooknum;
915 	struct nf_hook_ops *nat_ops;
916 	int i, ret;
917 
918 	if (WARN_ON_ONCE(ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net)))
919 		return -EINVAL;
920 
921 	nat_proto_net = &nat_net->nat_proto_net[ops->pf];
922 
923 	for (i = 0; i < ops_count; i++) {
924 		if (WARN_ON(orig_nat_ops[i].pf != ops->pf))
925 			return -EINVAL;
926 		if (orig_nat_ops[i].hooknum == hooknum) {
927 			hooknum = i;
928 			break;
929 		}
930 	}
931 
932 	if (WARN_ON_ONCE(i == ops_count))
933 		return -EINVAL;
934 
935 	mutex_lock(&nf_nat_proto_mutex);
936 	if (!nat_proto_net->nat_hook_ops) {
937 		WARN_ON(nat_proto_net->users != 0);
938 
939 		nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL);
940 		if (!nat_ops) {
941 			mutex_unlock(&nf_nat_proto_mutex);
942 			return -ENOMEM;
943 		}
944 
945 		for (i = 0; i < ops_count; i++) {
946 			priv = kzalloc(sizeof(*priv), GFP_KERNEL);
947 			if (priv) {
948 				nat_ops[i].priv = priv;
949 				continue;
950 			}
951 			mutex_unlock(&nf_nat_proto_mutex);
952 			while (i)
953 				kfree(nat_ops[--i].priv);
954 			kfree(nat_ops);
955 			return -ENOMEM;
956 		}
957 
958 		ret = nf_register_net_hooks(net, nat_ops, ops_count);
959 		if (ret < 0) {
960 			mutex_unlock(&nf_nat_proto_mutex);
961 			for (i = 0; i < ops_count; i++)
962 				kfree(nat_ops[i].priv);
963 			kfree(nat_ops);
964 			return ret;
965 		}
966 
967 		nat_proto_net->nat_hook_ops = nat_ops;
968 	}
969 
970 	nat_ops = nat_proto_net->nat_hook_ops;
971 	priv = nat_ops[hooknum].priv;
972 	if (WARN_ON_ONCE(!priv)) {
973 		mutex_unlock(&nf_nat_proto_mutex);
974 		return -EOPNOTSUPP;
975 	}
976 
977 	ret = nf_hook_entries_insert_raw(&priv->entries, ops);
978 	if (ret == 0)
979 		nat_proto_net->users++;
980 
981 	mutex_unlock(&nf_nat_proto_mutex);
982 	return ret;
983 }
984 EXPORT_SYMBOL_GPL(nf_nat_register_fn);
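
/* Registration sketch (illustrative): a NAT backend (an iptables nat
 * table or an nft NAT chain) supplies its own rule-lookup hook ops.  The
 * per-family NAT modules wrap this function and pass their base ops array
 * as orig_nat_ops; the first caller for a family gets the shared
 * nf_nat_inet_fn() based hooks registered, and each caller's ops are
 * threaded into priv->entries, which nf_nat_inet_fn() walks for new
 * connections.  The callback name below is an assumption for the sketch:
 *
 *	static const struct nf_hook_ops example_ops = {
 *		.hook     = example_nat_rule_lookup,
 *		.pf       = NFPROTO_IPV4,
 *		.hooknum  = NF_INET_POST_ROUTING,
 *		.priority = NF_IP_PRI_NAT_SRC,
 *	};
 */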
985 
986 void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops,
987 		          unsigned int ops_count)
988 {
989 	struct nat_net *nat_net = net_generic(net, nat_net_id);
990 	struct nf_nat_hooks_net *nat_proto_net;
991 	struct nf_nat_lookup_hook_priv *priv;
992 	struct nf_hook_ops *nat_ops;
993 	int hooknum = ops->hooknum;
994 	int i;
995 
996 	if (ops->pf >= ARRAY_SIZE(nat_net->nat_proto_net))
997 		return;
998 
999 	nat_proto_net = &nat_net->nat_proto_net[ops->pf];
1000 
1001 	mutex_lock(&nf_nat_proto_mutex);
1002 	if (WARN_ON(nat_proto_net->users == 0))
1003 		goto unlock;
1004 
1005 	nat_proto_net->users--;
1006 
1007 	nat_ops = nat_proto_net->nat_hook_ops;
1008 	for (i = 0; i < ops_count; i++) {
1009 		if (nat_ops[i].hooknum == hooknum) {
1010 			hooknum = i;
1011 			break;
1012 		}
1013 	}
1014 	if (WARN_ON_ONCE(i == ops_count))
1015 		goto unlock;
1016 	priv = nat_ops[hooknum].priv;
1017 	nf_hook_entries_delete_raw(&priv->entries, ops);
1018 
1019 	if (nat_proto_net->users == 0) {
1020 		nf_unregister_net_hooks(net, nat_ops, ops_count);
1021 
1022 		for (i = 0; i < ops_count; i++) {
1023 			priv = nat_ops[i].priv;
1024 			kfree_rcu(priv, rcu_head);
1025 		}
1026 
1027 		nat_proto_net->nat_hook_ops = NULL;
1028 		kfree(nat_ops);
1029 	}
1030 unlock:
1031 	mutex_unlock(&nf_nat_proto_mutex);
1032 }
1033 EXPORT_SYMBOL_GPL(nf_nat_unregister_fn);
1034 
1035 static struct pernet_operations nat_net_ops = {
1036 	.id = &nat_net_id,
1037 	.size = sizeof(struct nat_net),
1038 };
1039 
1040 static struct nf_nat_hook nat_hook = {
1041 	.parse_nat_setup	= nfnetlink_parse_nat_setup,
1042 #ifdef CONFIG_XFRM
1043 	.decode_session		= __nf_nat_decode_session,
1044 #endif
1045 	.manip_pkt		= nf_nat_manip_pkt,
1046 };
1047 
1048 static int __init nf_nat_init(void)
1049 {
1050 	int ret, i;
1051 
1052 	/* Leave them the same for the moment. */
1053 	nf_nat_htable_size = nf_conntrack_htable_size;
1054 	if (nf_nat_htable_size < CONNTRACK_LOCKS)
1055 		nf_nat_htable_size = CONNTRACK_LOCKS;
1056 
1057 	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
1058 	if (!nf_nat_bysource)
1059 		return -ENOMEM;
1060 
1061 	ret = nf_ct_extend_register(&nat_extend);
1062 	if (ret < 0) {
1063 		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
1064 		pr_err("Unable to register extension\n");
1065 		return ret;
1066 	}
1067 
1068 	for (i = 0; i < CONNTRACK_LOCKS; i++)
1069 		spin_lock_init(&nf_nat_locks[i]);
1070 
1071 	ret = register_pernet_subsys(&nat_net_ops);
1072 	if (ret < 0) {
1073 		nf_ct_extend_unregister(&nat_extend);
1074 		return ret;
1075 	}
1076 
1077 	nf_ct_helper_expectfn_register(&follow_master_nat);
1078 
1079 	WARN_ON(nf_nat_hook != NULL);
1080 	RCU_INIT_POINTER(nf_nat_hook, &nat_hook);
1081 
1082 	return 0;
1083 }
1084 
1085 static void __exit nf_nat_cleanup(void)
1086 {
1087 	struct nf_nat_proto_clean clean = {};
1088 	unsigned int i;
1089 
1090 	nf_ct_iterate_destroy(nf_nat_proto_clean, &clean);
1091 
1092 	nf_ct_extend_unregister(&nat_extend);
1093 	nf_ct_helper_expectfn_unregister(&follow_master_nat);
1094 	RCU_INIT_POINTER(nf_nat_hook, NULL);
1095 
1096 	synchronize_rcu();
1097 
1098 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
1099 		kfree(nf_nat_l4protos[i]);
1100 	synchronize_net();
1101 	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
1102 	unregister_pernet_subsys(&nat_net_ops);
1103 }
1104 
1105 MODULE_LICENSE("GPL");
1106 
1107 module_init(nf_nat_init);
1108 module_exit(nf_nat_cleanup);
1109