// SPDX-License-Identifier: GPL-2.0

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>

#include <net/netfilter/nf_nat_masquerade.h>

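/* Deferred conntrack table walk: iterate the per-netns conntrack table
 * from a work item and drop entries that match the device/address
 * being removed.
 */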
struct masq_dev_work {
	struct work_struct work;
	struct net *net;
	netns_tracker ns_tracker;
	union nf_inet_addr addr;
	int ifindex;
	int (*iter)(struct nf_conn *i, void *data);
};

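/* Cap the number of in-flight cleanup work items; if the limit is hit,
 * scheduling is skipped and the affected entries simply time out.
 */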
#define MAX_MASQ_WORKER_COUNT	16

static DEFINE_MUTEX(masq_mutex);
static unsigned int masq_refcnt __read_mostly;
static atomic_t masq_worker_count __read_mostly;

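/* Set up source NAT (masquerade) for an IPv4 packet: pick the outgoing
 * device's address towards the route's next hop and hand the modified
 * range to the generic NAT setup.
 */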
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
		       const struct nf_nat_range2 *range,
		       const struct net_device *out)
{
	struct nf_conn *ct;
	struct nf_conn_nat *nat;
	enum ip_conntrack_info ctinfo;
	struct nf_nat_range2 newrange;
	const struct rtable *rt;
	__be32 newsrc, nh;

	WARN_ON(hooknum != NF_INET_POST_ROUTING);

	ct = nf_ct_get(skb, &ctinfo);

	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
			 ctinfo == IP_CT_RELATED_REPLY)));

	/* Source address is 0.0.0.0 - locally generated packet that is
	 * probably not supposed to be masqueraded.
	 */
	if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
		return NF_ACCEPT;

	rt = skb_rtable(skb);
	nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
	newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
	if (!newsrc) {
		pr_info("%s ate my IP address\n", out->name);
		return NF_DROP;
	}

	nat = nf_ct_nat_ext_add(ct);
	if (nat)
		nat->masq_index = out->ifindex;

	/* Transfer from original range. */
	memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
	memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
	newrange.flags       = range->flags | NF_NAT_RANGE_MAP_IPS;
	newrange.min_addr.ip = newsrc;
	newrange.max_addr.ip = newsrc;
	newrange.min_proto   = range->min_proto;
	newrange.max_proto   = range->max_proto;

	/* Hand modified range to generic setup. */
	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);

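/* Runs from the system workqueue: walk the conntrack table with the
 * filter set up in nf_nat_masq_schedule(), then drop the references
 * taken when the work item was queued.
 */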
static void iterate_cleanup_work(struct work_struct *work)
{
	struct nf_ct_iter_data iter_data = {};
	struct masq_dev_work *w;

	w = container_of(work, struct masq_dev_work, work);

	iter_data.net = w->net;
	iter_data.data = (void *)w;
	nf_ct_iterate_cleanup_net(w->iter, &iter_data);

	put_net_track(w->net, &w->ns_tracker);
	kfree(w);
	atomic_dec(&masq_worker_count);
	module_put(THIS_MODULE);
}

/* Iterate the conntrack table in the background and remove conntrack
 * entries that use the device/address being removed.
 *
 * In case too many work items have been queued already or memory allocation
 * fails, the iteration is skipped; conntrack entries will time out eventually.
 */
static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
				 int ifindex,
				 int (*iter)(struct nf_conn *i, void *data),
				 gfp_t gfp_flags)
{
	struct masq_dev_work *w;

	if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
		return;

	net = maybe_get_net(net);
	if (!net)
		return;

	if (!try_module_get(THIS_MODULE))
		goto err_module;

	w = kzalloc(sizeof(*w), gfp_flags);
	if (w) {
		/* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
		atomic_inc(&masq_worker_count);

		INIT_WORK(&w->work, iterate_cleanup_work);
		w->ifindex = ifindex;
		w->net = net;
		netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
		w->iter = iter;
		if (addr)
			w->addr = *addr;
		schedule_work(&w->work);
		return;
	}

	module_put(THIS_MODULE);
 err_module:
	put_net(net);
}

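/* Match conntrack entries that were masqueraded via the interface whose
 * index is stored in the work item.
 */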
static int device_cmp(struct nf_conn *i, void *arg)
{
	const struct nf_conn_nat *nat = nfct_nat(i);
	const struct masq_dev_work *w = arg;

	if (!nat)
		return 0;
	return nat->masq_index == w->ifindex;
}

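/* Netdevice notifier: called in process context, so the cleanup work
 * can be scheduled with GFP_KERNEL.
 */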
static int masq_device_event(struct notifier_block *this,
			     unsigned long event,
			     void *ptr)
{
	const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (event == NETDEV_DOWN) {
		/* Device was downed.  Search entire table for
		 * conntracks which were associated with that device,
		 * and forget them.
		 */

		nf_nat_masq_schedule(net, NULL, dev->ifindex,
				     device_cmp, GFP_KERNEL);
	}

	return NOTIFY_DONE;
}

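/* Match conntrack entries that were masqueraded via the interface and
 * whose reply destination equals the address being removed.
 */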
static int inet_cmp(struct nf_conn *ct, void *ptr)
{
	struct nf_conntrack_tuple *tuple;
	struct masq_dev_work *w = ptr;

	if (!device_cmp(ct, ptr))
		return 0;

	tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;

	return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3);
}

static int masq_inet_event(struct notifier_block *this,
			   unsigned long event,
			   void *ptr)
{
	const struct in_ifaddr *ifa = ptr;
	const struct in_device *idev;
	const struct net_device *dev;
	union nf_inet_addr addr;

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	/* The masq_dev_notifier will catch the case of the device going
	 * down.  So if the inetdev is dead and being destroyed we have
	 * no work to do.  Otherwise this is an individual address removal
	 * and we have to perform the flush.
	 */
	idev = ifa->ifa_dev;
	if (idev->dead)
		return NOTIFY_DONE;

	memset(&addr, 0, sizeof(addr));

	addr.ip = ifa->ifa_address;

	dev = idev->dev;
	nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
			     inet_cmp, GFP_KERNEL);

	return NOTIFY_DONE;
}

static struct notifier_block masq_dev_notifier = {
	.notifier_call	= masq_device_event,
};

static struct notifier_block masq_inet_notifier = {
	.notifier_call	= masq_inet_event,
};

#if IS_ENABLED(CONFIG_IPV6)
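/* IPv6 source address selection: go through nf_ipv6_ops when IPv6 is
 * built as a module, otherwise call ipv6_dev_get_saddr() directly.
 */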
static int
nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
		       const struct in6_addr *daddr, unsigned int srcprefs,
		       struct in6_addr *saddr)
{
#ifdef CONFIG_IPV6_MODULE
	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();

	if (!v6_ops)
		return -EHOSTUNREACH;

	return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#else
	return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#endif
}

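/* IPv6 counterpart of nf_nat_masquerade_ipv4(): select a source address
 * on the outgoing device and set up source NAT for the connection.
 */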
unsigned int
nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
		       const struct net_device *out)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	struct in6_addr src;
	struct nf_conn *ct;
	struct nf_nat_range2 newrange;

	ct = nf_ct_get(skb, &ctinfo);
	WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
			 ctinfo == IP_CT_RELATED_REPLY)));

	if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out,
				   &ipv6_hdr(skb)->daddr, 0, &src) < 0)
		return NF_DROP;

	nat = nf_ct_nat_ext_add(ct);
	if (nat)
		nat->masq_index = out->ifindex;

	newrange.flags		= range->flags | NF_NAT_RANGE_MAP_IPS;
	newrange.min_addr.in6	= src;
	newrange.max_addr.in6	= src;
	newrange.min_proto	= range->min_proto;
	newrange.max_proto	= range->max_proto;

	return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);

/* Atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
 *
 * Defer it to the system workqueue.
 *
 * As we can have 'a lot' of inet_events (depending on the number of ipv6
 * addresses being deleted), we also need to limit the work item queue.
 */
static int masq_inet6_event(struct notifier_block *this,
			    unsigned long event, void *ptr)
{
	struct inet6_ifaddr *ifa = ptr;
	const struct net_device *dev;
	union nf_inet_addr addr;

	if (event != NETDEV_DOWN)
		return NOTIFY_DONE;

	dev = ifa->idev->dev;

	memset(&addr, 0, sizeof(addr));

	addr.in6 = ifa->addr;

	nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
			     GFP_ATOMIC);
	return NOTIFY_DONE;
}

static struct notifier_block masq_inet6_notifier = {
	.notifier_call	= masq_inet6_event,
};

static int nf_nat_masquerade_ipv6_register_notifier(void)
{
	return register_inet6addr_notifier(&masq_inet6_notifier);
}
#else
static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; }
#endif

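/* Register the device/address notifiers on behalf of a masquerade user;
 * reference counted so the notifiers are installed only once no matter
 * how many users exist.
 */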
int nf_nat_masquerade_inet_register_notifiers(void)
{
	int ret = 0;

	mutex_lock(&masq_mutex);
	if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) {
		ret = -EOVERFLOW;
		goto out_unlock;
	}

	/* check if the notifier was already set */
	if (++masq_refcnt > 1)
		goto out_unlock;

	/* Register for device down reports */
	ret = register_netdevice_notifier(&masq_dev_notifier);
	if (ret)
		goto err_dec;
	/* Register IP address change reports */
	ret = register_inetaddr_notifier(&masq_inet_notifier);
	if (ret)
		goto err_unregister;

	ret = nf_nat_masquerade_ipv6_register_notifier();
	if (ret)
		goto err_unreg_inet;

	mutex_unlock(&masq_mutex);
	return ret;
err_unreg_inet:
	unregister_inetaddr_notifier(&masq_inet_notifier);
err_unregister:
	unregister_netdevice_notifier(&masq_dev_notifier);
err_dec:
	masq_refcnt--;
out_unlock:
	mutex_unlock(&masq_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers);

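/* Drop a reference; the notifiers are only unregistered once the last
 * masquerade user has gone away.
 */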
void nf_nat_masquerade_inet_unregister_notifiers(void)
{
	mutex_lock(&masq_mutex);
	/* check if the notifiers still have clients */
	if (--masq_refcnt > 0)
		goto out_unlock;

	unregister_netdevice_notifier(&masq_dev_notifier);
	unregister_inetaddr_notifier(&masq_inet_notifier);
#if IS_ENABLED(CONFIG_IPV6)
	unregister_inet6addr_notifier(&masq_inet6_notifier);
#endif
out_unlock:
	mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);