1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ip_vs_xmit.c: various packet transmitters for IPVS
4  *
5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
6  *              Julian Anastasov <ja@ssi.bg>
7  *
8  * Changes:
9  *
10  * Description of forwarding methods:
11  * - all transmitters are called from LOCAL_IN (remote clients) and
12  * LOCAL_OUT (local clients), but for ICMP they can also be called from FORWARD
13  * - not all connections have a destination server; for example,
14  * connections on the backup server when fwmark is used
15  * - bypass connections use the daddr from the packet
16  * - we can use the dst without a ref while sending in an RCU section; we take
17  * a ref when returning NF_ACCEPT for a NAT-ed packet sent via loopback
18  * LOCAL_OUT rules:
19  * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
20  * - skb->pkt_type is not set yet
21  * - the only place where we can see skb->sk != NULL
22  */
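/* Summary of the transmitters defined below (IPv6 variants, *_v6, are
 * built under CONFIG_IP_VS_IPV6):
 * - ip_vs_null_xmit:   do nothing, let the packet continue locally
 * - ip_vs_bypass_xmit: no real server, route by the packet's own daddr
 * - ip_vs_nat_xmit:    VS/NAT, rewrite the destination address/port
 * - ip_vs_tunnel_xmit: VS/TUN, encapsulate in IPIP or GUE-in-UDP
 * - ip_vs_dr_xmit:     VS/DR, route the unmodified packet to the server
 * - ip_vs_icmp_xmit:   related ICMP, mangled only for VS/NAT
 */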
23 
24 #define KMSG_COMPONENT "IPVS"
25 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
26 
27 #include <linux/kernel.h>
28 #include <linux/slab.h>
29 #include <linux/tcp.h>                  /* for tcphdr */
30 #include <net/ip.h>
31 #include <net/gue.h>
32 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
33 #include <net/udp.h>
34 #include <net/icmp.h>                   /* for icmp_send */
35 #include <net/route.h>                  /* for ip_route_output */
36 #include <net/ipv6.h>
37 #include <net/ip6_route.h>
38 #include <net/ip_tunnels.h>
39 #include <net/addrconf.h>
40 #include <linux/icmpv6.h>
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv4.h>
43 
44 #include <net/ip_vs.h>
45 
46 enum {
47 	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
48 	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
49 	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
50 				      * local
51 				      */
52 	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
53 	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
54 	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
55 };
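/* The modes above are OR-ed together by the transmitters below, e.g.:
 *
 *	NAT:	IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 *		IP_VS_RT_MODE_RDR
 *	TUN:	IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 *		IP_VS_RT_MODE_CONNECT | IP_VS_RT_MODE_TUNNEL
 *	DR:	IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
 *		IP_VS_RT_MODE_KNOWN_NH
 *	BYPASS:	IP_VS_RT_MODE_NON_LOCAL
 */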
56 
57 static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
58 {
59 	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
60 }
61 
62 static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
63 {
64 	kfree(dest_dst);
65 }
66 
67 /*
68  *      Destination cache to speed up outgoing route lookup
69  */
70 static inline void
71 __ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
72 		struct dst_entry *dst, u32 dst_cookie)
73 {
74 	struct ip_vs_dest_dst *old;
75 
76 	old = rcu_dereference_protected(dest->dest_dst,
77 					lockdep_is_held(&dest->dst_lock));
78 
79 	if (dest_dst) {
80 		dest_dst->dst_cache = dst;
81 		dest_dst->dst_cookie = dst_cookie;
82 	}
83 	rcu_assign_pointer(dest->dest_dst, dest_dst);
84 
85 	if (old)
86 		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
87 }
88 
89 static inline struct ip_vs_dest_dst *
90 __ip_vs_dst_check(struct ip_vs_dest *dest)
91 {
92 	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
93 	struct dst_entry *dst;
94 
95 	if (!dest_dst)
96 		return NULL;
97 	dst = dest_dst->dst_cache;
98 	if (dst->obsolete &&
99 	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
100 		return NULL;
101 	return dest_dst;
102 }
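/* Typical use of the cache, a condensed sketch of what
 * __ip_vs_get_out_rt() below does (error handling omitted):
 *
 *	dest_dst = __ip_vs_dst_check(dest);	// under RCU read lock
 *	if (!dest_dst) {
 *		dest_dst = ip_vs_dest_dst_alloc();
 *		spin_lock_bh(&dest->dst_lock);
 *		// ... route lookup fills dest_dst->dst_saddr ...
 *		__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
 *		spin_unlock_bh(&dest->dst_lock);
 *	}
 */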
103 
104 static inline bool
105 __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
106 {
107 	if (IP6CB(skb)->frag_max_size) {
108 		/* frag_max_size tells us that this packet has been
109 		 * defragmented by the netfilter IPv6 conntrack module.
110 		 */
111 		if (IP6CB(skb)->frag_max_size > mtu)
112 			return true; /* largest fragment violates MTU */
113 	}
114 	else if (skb->len > mtu && !skb_is_gso(skb)) {
115 		return true; /* Packet size violates the MTU */
116 	}
117 	return false;
118 }
119 
120 /* Get route to daddr, update *saddr, optionally bind route to saddr */
121 static struct rtable *do_output_route4(struct net *net, __be32 daddr,
122 				       int rt_mode, __be32 *saddr)
123 {
124 	struct flowi4 fl4;
125 	struct rtable *rt;
126 	bool loop = false;
127 
128 	memset(&fl4, 0, sizeof(fl4));
129 	fl4.daddr = daddr;
130 	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
131 			   FLOWI_FLAG_KNOWN_NH : 0;
132 
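	/* The lookup below is retried at most once:
	 * - on -EINVAL in CONNECT mode with a caller-supplied saddr, the
	 *   saddr is cleared and the lookup repeated;
	 * - in CONNECT mode with no saddr yet, the saddr chosen by the
	 *   first lookup is recorded and the lookup repeated bound to it.
	 */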
133 retry:
134 	rt = ip_route_output_key(net, &fl4);
135 	if (IS_ERR(rt)) {
136 		/* Invalid saddr ? */
137 		if (PTR_ERR(rt) == -EINVAL && *saddr &&
138 		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
139 			*saddr = 0;
140 			flowi4_update_output(&fl4, 0, 0, daddr, 0);
141 			goto retry;
142 		}
143 		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
144 		return NULL;
145 	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
146 		ip_rt_put(rt);
147 		*saddr = fl4.saddr;
148 		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
149 		loop = true;
150 		goto retry;
151 	}
152 	*saddr = fl4.saddr;
153 	return rt;
154 }
155 
156 #ifdef CONFIG_IP_VS_IPV6
157 static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
158 {
159 	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
160 }
161 #endif
162 
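/* Refuse forwarding when the new route and rt_mode do not agree:
 * - a local destination requires IP_VS_RT_MODE_LOCAL, plus
 *   IP_VS_RT_MODE_RDR if the packet's original route was not local;
 * - a non-local destination requires IP_VS_RT_MODE_NON_LOCAL and must
 *   not come from a loopback source address.
 * Returns true when the boundary would be crossed (the caller drops).
 */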
163 static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
164 						int rt_mode,
165 						bool new_rt_is_local)
166 {
167 	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
168 	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
169 	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
170 	bool source_is_loopback;
171 	bool old_rt_is_local;
172 
173 #ifdef CONFIG_IP_VS_IPV6
174 	if (skb_af == AF_INET6) {
175 		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
176 
177 		source_is_loopback =
178 			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
179 			(addr_type & IPV6_ADDR_LOOPBACK);
180 		old_rt_is_local = __ip_vs_is_local_route6(
181 			(struct rt6_info *)skb_dst(skb));
182 	} else
183 #endif
184 	{
185 		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
186 		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
187 	}
188 
189 	if (unlikely(new_rt_is_local)) {
190 		if (!rt_mode_allow_local)
191 			return true;
192 		if (!rt_mode_allow_redirect && !old_rt_is_local)
193 			return true;
194 	} else {
195 		if (!rt_mode_allow_non_local)
196 			return true;
197 		if (source_is_loopback)
198 			return true;
199 	}
200 	return false;
201 }
202 
203 static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
204 {
205 	struct sock *sk = skb->sk;
206 	struct rtable *ort = skb_rtable(skb);
207 
208 	if (!skb->dev && sk && sk_fullsock(sk))
209 		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
210 }
211 
212 static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
213 					  int rt_mode,
214 					  struct ip_vs_iphdr *ipvsh,
215 					  struct sk_buff *skb, int mtu)
216 {
217 #ifdef CONFIG_IP_VS_IPV6
218 	if (skb_af == AF_INET6) {
219 		struct net *net = ipvs->net;
220 
221 		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
222 			if (!skb->dev)
223 				skb->dev = net->loopback_dev;
224 			/* only send ICMP too big on first fragment */
225 			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
226 				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
227 			IP_VS_DBG(1, "frag needed for %pI6c\n",
228 				  &ipv6_hdr(skb)->saddr);
229 			return false;
230 		}
231 	} else
232 #endif
233 	{
234 		/* If we're going to tunnel the packet and pmtu discovery
235 		 * is disabled, we'll just fragment it anyway
236 		 */
237 		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
238 			return true;
239 
240 		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
241 			     skb->len > mtu && !skb_is_gso(skb) &&
242 			     !ip_vs_iph_icmp(ipvsh))) {
243 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
244 				  htonl(mtu));
245 			IP_VS_DBG(1, "frag needed for %pI4\n",
246 				  &ip_hdr(skb)->saddr);
247 			return false;
248 		}
249 	}
250 
251 	return true;
252 }
253 
254 static inline bool decrement_ttl(struct netns_ipvs *ipvs,
255 				 int skb_af,
256 				 struct sk_buff *skb)
257 {
258 	struct net *net = ipvs->net;
259 
260 #ifdef CONFIG_IP_VS_IPV6
261 	if (skb_af == AF_INET6) {
262 		struct dst_entry *dst = skb_dst(skb);
263 
264 		/* check and decrement ttl */
265 		if (ipv6_hdr(skb)->hop_limit <= 1) {
266 			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
267 
268 			/* Force OUTPUT device used as source address */
269 			skb->dev = dst->dev;
270 			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
271 				    ICMPV6_EXC_HOPLIMIT, 0);
272 			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
273 
274 			return false;
275 		}
276 
277 		/* don't propagate ttl change to cloned packets */
278 		if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
279 			return false;
280 
281 		ipv6_hdr(skb)->hop_limit--;
282 	} else
283 #endif
284 	{
285 		if (ip_hdr(skb)->ttl <= 1) {
286 			/* Tell the sender its packet died... */
287 			__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
288 			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
289 			return false;
290 		}
291 
292 		/* don't propagate ttl change to cloned packets */
293 		if (!skb_make_writable(skb, sizeof(struct iphdr)))
294 			return false;
295 
296 		/* Decrease ttl */
297 		ip_decrease_ttl(ip_hdr(skb));
298 	}
299 
300 	return true;
301 }
302 
303 /* Get route to destination or remote server */
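/* Returns -1 on error (route lookup failed or forwarding refused), 1 if
 * the destination is local (skb and its original route are preserved),
 * 0 otherwise, with the new route attached to the skb.
 */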
304 static int
305 __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
306 		   struct ip_vs_dest *dest,
307 		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
308 		   struct ip_vs_iphdr *ipvsh)
309 {
310 	struct net *net = ipvs->net;
311 	struct ip_vs_dest_dst *dest_dst;
312 	struct rtable *rt;			/* Route to the other host */
313 	int mtu;
314 	int local, noref = 1;
315 
316 	if (dest) {
317 		dest_dst = __ip_vs_dst_check(dest);
318 		if (likely(dest_dst))
319 			rt = (struct rtable *) dest_dst->dst_cache;
320 		else {
321 			dest_dst = ip_vs_dest_dst_alloc();
322 			spin_lock_bh(&dest->dst_lock);
323 			if (!dest_dst) {
324 				__ip_vs_dst_set(dest, NULL, NULL, 0);
325 				spin_unlock_bh(&dest->dst_lock);
326 				goto err_unreach;
327 			}
328 			rt = do_output_route4(net, dest->addr.ip, rt_mode,
329 					      &dest_dst->dst_saddr.ip);
330 			if (!rt) {
331 				__ip_vs_dst_set(dest, NULL, NULL, 0);
332 				spin_unlock_bh(&dest->dst_lock);
333 				ip_vs_dest_dst_free(dest_dst);
334 				goto err_unreach;
335 			}
336 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
337 			spin_unlock_bh(&dest->dst_lock);
338 			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
339 				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
340 				  atomic_read(&rt->dst.__refcnt));
341 		}
342 		if (ret_saddr)
343 			*ret_saddr = dest_dst->dst_saddr.ip;
344 	} else {
345 		__be32 saddr = htonl(INADDR_ANY);
346 
347 		noref = 0;
348 
349 		/* For such unconfigured destinations avoid many route lookups
350 		 * for performance reasons, because we do not remember the saddr
351 		 */
352 		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
353 		rt = do_output_route4(net, daddr, rt_mode, &saddr);
354 		if (!rt)
355 			goto err_unreach;
356 		if (ret_saddr)
357 			*ret_saddr = saddr;
358 	}
359 
360 	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
361 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
362 						  local))) {
363 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
364 			     " daddr=%pI4\n", &daddr);
365 		goto err_put;
366 	}
367 
368 	if (unlikely(local)) {
369 		/* skb to local stack, preserve old route */
370 		if (!noref)
371 			ip_rt_put(rt);
372 		return local;
373 	}
374 
375 	if (!decrement_ttl(ipvs, skb_af, skb))
376 		goto err_put;
377 
378 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
379 		mtu = dst_mtu(&rt->dst);
380 	} else {
381 		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
382 		if (!dest)
383 			goto err_put;
384 		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
385 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
386 		if (mtu < 68) {
387 			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
388 			goto err_put;
389 		}
390 		maybe_update_pmtu(skb_af, skb, mtu);
391 	}
392 
393 	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
394 		goto err_put;
395 
396 	skb_dst_drop(skb);
397 	if (noref) {
398 		if (!local)
399 			skb_dst_set_noref(skb, &rt->dst);
400 		else
401 			skb_dst_set(skb, dst_clone(&rt->dst));
402 	} else
403 		skb_dst_set(skb, &rt->dst);
404 
405 	return local;
406 
407 err_put:
408 	if (!noref)
409 		ip_rt_put(rt);
410 	return -1;
411 
412 err_unreach:
413 	dst_link_failure(skb);
414 	return -1;
415 }
416 
417 #ifdef CONFIG_IP_VS_IPV6
418 static struct dst_entry *
419 __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
420 			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
421 {
422 	struct dst_entry *dst;
423 	struct flowi6 fl6 = {
424 		.daddr = *daddr,
425 	};
426 
427 	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
428 		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
429 
430 	dst = ip6_route_output(net, NULL, &fl6);
431 	if (dst->error)
432 		goto out_err;
433 	if (!ret_saddr)
434 		return dst;
435 	if (ipv6_addr_any(&fl6.saddr) &&
436 	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
437 			       &fl6.daddr, 0, &fl6.saddr) < 0)
438 		goto out_err;
439 	if (do_xfrm) {
440 		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
441 		if (IS_ERR(dst)) {
442 			dst = NULL;
443 			goto out_err;
444 		}
445 	}
446 	*ret_saddr = fl6.saddr;
447 	return dst;
448 
449 out_err:
450 	dst_release(dst);
451 	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
452 	return NULL;
453 }
454 
455 /*
456  * Get route to destination or remote server
457  */
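/* Same return convention as __ip_vs_get_out_rt() above. */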
458 static int
459 __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
460 		      struct ip_vs_dest *dest,
461 		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
462 		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
463 {
464 	struct net *net = ipvs->net;
465 	struct ip_vs_dest_dst *dest_dst;
466 	struct rt6_info *rt;			/* Route to the other host */
467 	struct dst_entry *dst;
468 	int mtu;
469 	int local, noref = 1;
470 
471 	if (dest) {
472 		dest_dst = __ip_vs_dst_check(dest);
473 		if (likely(dest_dst))
474 			rt = (struct rt6_info *) dest_dst->dst_cache;
475 		else {
476 			u32 cookie;
477 
478 			dest_dst = ip_vs_dest_dst_alloc();
479 			spin_lock_bh(&dest->dst_lock);
480 			if (!dest_dst) {
481 				__ip_vs_dst_set(dest, NULL, NULL, 0);
482 				spin_unlock_bh(&dest->dst_lock);
483 				goto err_unreach;
484 			}
485 			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
486 						      &dest_dst->dst_saddr.in6,
487 						      do_xfrm, rt_mode);
488 			if (!dst) {
489 				__ip_vs_dst_set(dest, NULL, NULL, 0);
490 				spin_unlock_bh(&dest->dst_lock);
491 				ip_vs_dest_dst_free(dest_dst);
492 				goto err_unreach;
493 			}
494 			rt = (struct rt6_info *) dst;
495 			cookie = rt6_get_cookie(rt);
496 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
497 			spin_unlock_bh(&dest->dst_lock);
498 			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
499 				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
500 				  atomic_read(&rt->dst.__refcnt));
501 		}
502 		if (ret_saddr)
503 			*ret_saddr = dest_dst->dst_saddr.in6;
504 	} else {
505 		noref = 0;
506 		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
507 					      rt_mode);
508 		if (!dst)
509 			goto err_unreach;
510 		rt = (struct rt6_info *) dst;
511 	}
512 
513 	local = __ip_vs_is_local_route6(rt);
514 
515 	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
516 						  local))) {
517 		IP_VS_DBG_RL("We are crossing local and non-local addresses"
518 			     " daddr=%pI6\n", daddr);
519 		goto err_put;
520 	}
521 
522 	if (unlikely(local)) {
523 		/* skb to local stack, preserve old route */
524 		if (!noref)
525 			dst_release(&rt->dst);
526 		return local;
527 	}
528 
529 	if (!decrement_ttl(ipvs, skb_af, skb))
530 		goto err_put;
531 
532 	/* MTU checking */
533 	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
534 		mtu = dst_mtu(&rt->dst);
535 	else {
536 		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
537 		if (!dest)
538 			goto err_put;
539 		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
540 			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
541 		if (mtu < IPV6_MIN_MTU) {
542 			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
543 				     IPV6_MIN_MTU);
544 			goto err_put;
545 		}
546 		maybe_update_pmtu(skb_af, skb, mtu);
547 	}
548 
549 	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
550 		goto err_put;
551 
552 	skb_dst_drop(skb);
553 	if (noref) {
554 		if (!local)
555 			skb_dst_set_noref(skb, &rt->dst);
556 		else
557 			skb_dst_set(skb, dst_clone(&rt->dst));
558 	} else
559 		skb_dst_set(skb, &rt->dst);
560 
561 	return local;
562 
563 err_put:
564 	if (!noref)
565 		dst_release(&rt->dst);
566 	return -1;
567 
568 err_unreach:
569 	/* The ip6_link_failure function requires the dev field to be set
570 	 * in order to get the net (further for the sake of fwmark
571 	 * reflection).
572 	 */
573 	if (!skb->dev)
574 		skb->dev = skb_dst(skb)->dev;
575 
576 	dst_link_failure(skb);
577 	return -1;
578 }
579 #endif
580 
581 
582 /* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
583 static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
584 					    struct ip_vs_conn *cp)
585 {
586 	int ret = NF_ACCEPT;
587 
588 	skb->ipvs_property = 1;
589 	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
590 		ret = ip_vs_confirm_conntrack(skb);
591 	if (ret == NF_ACCEPT) {
592 		nf_reset(skb);
593 		skb_forward_csum(skb);
594 	}
595 	return ret;
596 }
597 
598 /* In the event of a remote destination, it's possible that the skb was
599  * matched against an old socket (particularly a TIME-WAIT socket). This
600  * causes havoc down the line (ip_local_out et al. expect regular sockets
601  * and invalid memory accesses will happen), so simply drop the association
602  * in this case.
603  */
604 static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
605 {
606 	/* If dev is set, the packet came from the LOCAL_IN callback and
607 	 * not from a local TCP socket.
608 	 */
609 	if (skb->dev)
610 		skb_orphan(skb);
611 }
612 
613 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
614 static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
615 					 struct ip_vs_conn *cp, int local)
616 {
617 	int ret = NF_STOLEN;
618 
619 	skb->ipvs_property = 1;
620 	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
621 		ip_vs_notrack(skb);
622 	else
623 		ip_vs_update_conntrack(skb, cp, 1);
624 
625 	/* Remove the early_demux association unless it's bound for the
626 	 * exact same port and address on this host after translation.
627 	 */
628 	if (!local || cp->vport != cp->dport ||
629 	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
630 		ip_vs_drop_early_demux_sk(skb);
631 
632 	if (!local) {
633 		skb_forward_csum(skb);
634 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
635 			NULL, skb_dst(skb)->dev, dst_output);
636 	} else
637 		ret = NF_ACCEPT;
638 
639 	return ret;
640 }
641 
642 /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
643 static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
644 				     struct ip_vs_conn *cp, int local)
645 {
646 	int ret = NF_STOLEN;
647 
648 	skb->ipvs_property = 1;
649 	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
650 		ip_vs_notrack(skb);
651 	if (!local) {
652 		ip_vs_drop_early_demux_sk(skb);
653 		skb_forward_csum(skb);
654 		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
655 			NULL, skb_dst(skb)->dev, dst_output);
656 	} else
657 		ret = NF_ACCEPT;
658 	return ret;
659 }
660 
661 
662 /*
663  *      NULL transmitter (do nothing except return NF_ACCEPT)
664  */
665 int
666 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
667 		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
668 {
669 	/* we do not touch skb and do not need pskb ptr */
670 	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
671 }
672 
673 
674 /*
675  *      Bypass transmitter
676  *      Let packets bypass the destination when the destination is not
677  *      available; it may only be used in a transparent cache cluster.
678  */
679 int
680 ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
681 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
682 {
683 	struct iphdr  *iph = ip_hdr(skb);
684 
685 	EnterFunction(10);
686 
687 	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
688 			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
689 		goto tx_error;
690 
691 	ip_send_check(iph);
692 
693 	/* Another hack: avoid icmp_send in ip_fragment */
694 	skb->ignore_df = 1;
695 
696 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
697 
698 	LeaveFunction(10);
699 	return NF_STOLEN;
700 
701  tx_error:
702 	kfree_skb(skb);
703 	LeaveFunction(10);
704 	return NF_STOLEN;
705 }
706 
707 #ifdef CONFIG_IP_VS_IPV6
708 int
709 ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
710 		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
711 {
712 	struct ipv6hdr *iph = ipv6_hdr(skb);
713 
714 	EnterFunction(10);
715 
716 	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
717 				  &iph->daddr, NULL,
718 				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
719 		goto tx_error;
720 
721 	/* Another hack: avoid icmp_send in ip_fragment */
722 	skb->ignore_df = 1;
723 
724 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
725 
726 	LeaveFunction(10);
727 	return NF_STOLEN;
728 
729  tx_error:
730 	kfree_skb(skb);
731 	LeaveFunction(10);
732 	return NF_STOLEN;
733 }
734 #endif
735 
736 /*
737  *      NAT transmitter (only for outside-to-inside NAT forwarding)
738  *      Not used for related ICMP
739  */
740 int
741 ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
742 	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
743 {
744 	struct rtable *rt;		/* Route to the other host */
745 	int local, rc, was_input;
746 
747 	EnterFunction(10);
748 
749 	/* check if it is a connection of no-client-port */
750 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
751 		__be16 _pt, *p;
752 
753 		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
754 		if (p == NULL)
755 			goto tx_error;
756 		ip_vs_conn_fill_cport(cp, *p);
757 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
758 	}
759 
760 	was_input = rt_is_input_route(skb_rtable(skb));
761 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
762 				   IP_VS_RT_MODE_LOCAL |
763 				   IP_VS_RT_MODE_NON_LOCAL |
764 				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
765 	if (local < 0)
766 		goto tx_error;
767 	rt = skb_rtable(skb);
768 	/*
769 	 * Avoid duplicate tuple in reply direction for NAT traffic
770 	 * to local address when connection is sync-ed
771 	 */
772 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
773 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
774 		enum ip_conntrack_info ctinfo;
775 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
776 
777 		if (ct) {
778 			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
779 					 "ip_vs_nat_xmit(): "
780 					 "stopping DNAT to local address");
781 			goto tx_error;
782 		}
783 	}
784 #endif
785 
786 	/* From world but DNAT to loopback address? */
787 	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
788 		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
789 				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
790 				 "address");
791 		goto tx_error;
792 	}
793 
794 	/* copy-on-write the packet before mangling it */
795 	if (!skb_make_writable(skb, sizeof(struct iphdr)))
796 		goto tx_error;
797 
798 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
799 		goto tx_error;
800 
801 	/* mangle the packet */
802 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
803 		goto tx_error;
804 	ip_hdr(skb)->daddr = cp->daddr.ip;
805 	ip_send_check(ip_hdr(skb));
806 
807 	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");
808 
809 	/* FIXME: when an application helper enlarges the packet and the length
810 	   is larger than the MTU of the outgoing device, there will still be
811 	   an MTU problem. */
812 
813 	/* Another hack: avoid icmp_send in ip_fragment */
814 	skb->ignore_df = 1;
815 
816 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
817 
818 	LeaveFunction(10);
819 	return rc;
820 
821   tx_error:
822 	kfree_skb(skb);
823 	LeaveFunction(10);
824 	return NF_STOLEN;
825 }
826 
827 #ifdef CONFIG_IP_VS_IPV6
828 int
829 ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
830 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
831 {
832 	struct rt6_info *rt;		/* Route to the other host */
833 	int local, rc;
834 
835 	EnterFunction(10);
836 
837 	/* check if it is a connection of no-client-port */
838 	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
839 		__be16 _pt, *p;
840 		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
841 		if (p == NULL)
842 			goto tx_error;
843 		ip_vs_conn_fill_cport(cp, *p);
844 		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
845 	}
846 
847 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
848 				      &cp->daddr.in6,
849 				      NULL, ipvsh, 0,
850 				      IP_VS_RT_MODE_LOCAL |
851 				      IP_VS_RT_MODE_NON_LOCAL |
852 				      IP_VS_RT_MODE_RDR);
853 	if (local < 0)
854 		goto tx_error;
855 	rt = (struct rt6_info *) skb_dst(skb);
856 	/*
857 	 * Avoid duplicate tuple in reply direction for NAT traffic
858 	 * to local address when connection is sync-ed
859 	 */
860 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
861 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
862 		enum ip_conntrack_info ctinfo;
863 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
864 
865 		if (ct) {
866 			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
867 					 "ip_vs_nat_xmit_v6(): "
868 					 "stopping DNAT to local address");
869 			goto tx_error;
870 		}
871 	}
872 #endif
873 
874 	/* From world but DNAT to loopback address? */
875 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
876 	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
877 		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
878 				 "ip_vs_nat_xmit_v6(): "
879 				 "stopping DNAT to loopback address");
880 		goto tx_error;
881 	}
882 
883 	/* copy-on-write the packet before mangling it */
884 	if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
885 		goto tx_error;
886 
887 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
888 		goto tx_error;
889 
890 	/* mangle the packet */
891 	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
892 		goto tx_error;
893 	ipv6_hdr(skb)->daddr = cp->daddr.in6;
894 
895 	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");
896 
897 	/* FIXME: when an application helper enlarges the packet and the length
898 	   is larger than the MTU of the outgoing device, there will still be
899 	   an MTU problem. */
900 
901 	/* Another hack: avoid icmp_send in ip_fragment */
902 	skb->ignore_df = 1;
903 
904 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
905 
906 	LeaveFunction(10);
907 	return rc;
908 
909 tx_error:
910 	LeaveFunction(10);
911 	kfree_skb(skb);
912 	return NF_STOLEN;
913 }
914 #endif
915 
916 /* When forwarding a packet, we must ensure that we've got enough headroom
917  * for the encapsulation packet in the skb.  This also gives us an
918  * opportunity to figure out what the payload_len, dsfield, ttl, and df
919  * values should be, so that we won't need to look at the old ip header
920  * again
921  */
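/* For example, ip_vs_tunnel_xmit() below reserves
 *	LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
 * plus sizeof(struct udphdr) + sizeof(struct guehdr) when the
 * destination uses a GUE tunnel.
 */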
922 static struct sk_buff *
923 ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
924 			   unsigned int max_headroom, __u8 *next_protocol,
925 			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
926 			   __be16 *df)
927 {
928 	struct sk_buff *new_skb = NULL;
929 	struct iphdr *old_iph = NULL;
930 	__u8 old_dsfield;
931 #ifdef CONFIG_IP_VS_IPV6
932 	struct ipv6hdr *old_ipv6h = NULL;
933 #endif
934 
935 	ip_vs_drop_early_demux_sk(skb);
936 
937 	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
938 		new_skb = skb_realloc_headroom(skb, max_headroom);
939 		if (!new_skb)
940 			goto error;
941 		if (skb->sk)
942 			skb_set_owner_w(new_skb, skb->sk);
943 		consume_skb(skb);
944 		skb = new_skb;
945 	}
946 
947 #ifdef CONFIG_IP_VS_IPV6
948 	if (skb_af == AF_INET6) {
949 		old_ipv6h = ipv6_hdr(skb);
950 		*next_protocol = IPPROTO_IPV6;
951 		if (payload_len)
952 			*payload_len =
953 				ntohs(old_ipv6h->payload_len) +
954 				sizeof(*old_ipv6h);
955 		old_dsfield = ipv6_get_dsfield(old_ipv6h);
956 		*ttl = old_ipv6h->hop_limit;
957 		if (df)
958 			*df = 0;
959 	} else
960 #endif
961 	{
962 		old_iph = ip_hdr(skb);
963 		/* Copy DF, reset fragment offset and MF */
964 		if (df)
965 			*df = (old_iph->frag_off & htons(IP_DF));
966 		*next_protocol = IPPROTO_IPIP;
967 
968 		/* fix old IP header checksum */
969 		ip_send_check(old_iph);
970 		old_dsfield = ipv4_get_dsfield(old_iph);
971 		*ttl = old_iph->ttl;
972 		if (payload_len)
973 			*payload_len = ntohs(old_iph->tot_len);
974 	}
975 
976 	/* Implement full-functionality option for ECN encapsulation */
977 	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);
978 
979 	return skb;
980 error:
981 	kfree_skb(skb);
982 	return ERR_PTR(-ENOMEM);
983 }
984 
985 static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
986 {
987 	switch (encaps_af) {
988 	case AF_INET:
989 		return SKB_GSO_IPXIP4;
990 	case AF_INET6:
991 		return SKB_GSO_IPXIP6;
992 	default:
993 		return 0;
994 	}
995 }
996 
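/* Prepend a GUE header and a UDP header in front of the current payload,
 * so that the packet built by the tunnel transmitters becomes:
 *
 *	outer IP | UDP (dest = cp->dest->tun_port) | GUE | inner packet
 *
 * *next_protocol carries the inner protocol into the GUE header and is
 * then switched to IPPROTO_UDP for the outer header.
 */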
997 static int
998 ipvs_gue_encap(struct net *net, struct sk_buff *skb,
999 	       struct ip_vs_conn *cp, __u8 *next_protocol)
1000 {
1001 	__be16 dport;
1002 	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
1003 	struct udphdr  *udph;	/* Our new UDP header */
1004 	struct guehdr  *gueh;	/* Our new GUE header */
1005 
1006 	skb_push(skb, sizeof(struct guehdr));
1007 
1008 	gueh = (struct guehdr *)skb->data;
1009 
1010 	gueh->control = 0;
1011 	gueh->version = 0;
1012 	gueh->hlen = 0;
1013 	gueh->flags = 0;
1014 	gueh->proto_ctype = *next_protocol;
1015 
1016 	skb_push(skb, sizeof(struct udphdr));
1017 	skb_reset_transport_header(skb);
1018 
1019 	udph = udp_hdr(skb);
1020 
1021 	dport = cp->dest->tun_port;
1022 	udph->dest = dport;
1023 	udph->source = sport;
1024 	udph->len = htons(skb->len);
1025 	udph->check = 0;
1026 
1027 	*next_protocol = IPPROTO_UDP;
1028 
1029 	return 0;
1030 }
1031 
1032 /*
1033  *   IP Tunneling transmitter
1034  *
1035  *   This function encapsulates the packet in a new IP packet whose
1036  *   destination will be set to cp->daddr. Most of the code in this
1037  *   function is taken from ipip.c.
1038  *
1039  *   It is used in a VS/TUN cluster. The load balancer selects a real
1040  *   server from the cluster based on a scheduling algorithm,
1041  *   encapsulates the request packet and forwards it to the selected
1042  *   server. For example, all real servers are configured with
1043  *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
1044  *   the encapsulated packet, it will decapsulate the packet, process
1045  *   the request and return the response packets directly to the client
1046  *   without passing through the load balancer. This can greatly increase
1047  *   the scalability of the virtual server.
1048  *
1049  *   Used for ANY protocol
1050  */
1051 int
1052 ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1053 		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1054 {
1055 	struct netns_ipvs *ipvs = cp->ipvs;
1056 	struct net *net = ipvs->net;
1057 	struct rtable *rt;			/* Route to the other host */
1058 	__be32 saddr;				/* Source for tunnel */
1059 	struct net_device *tdev;		/* Device to other host */
1060 	__u8 next_protocol = 0;
1061 	__u8 dsfield = 0;
1062 	__u8 ttl = 0;
1063 	__be16 df = 0;
1064 	__be16 *dfp = NULL;
1065 	struct iphdr  *iph;			/* Our new IP header */
1066 	unsigned int max_headroom;		/* The extra header space needed */
1067 	int ret, local;
1068 	int tun_type, gso_type;
1069 
1070 	EnterFunction(10);
1071 
1072 	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1073 				   IP_VS_RT_MODE_LOCAL |
1074 				   IP_VS_RT_MODE_NON_LOCAL |
1075 				   IP_VS_RT_MODE_CONNECT |
1076 				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
1077 	if (local < 0)
1078 		goto tx_error;
1079 	if (local)
1080 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1081 
1082 	rt = skb_rtable(skb);
1083 	tdev = rt->dst.dev;
1084 
1085 	/*
1086 	 * Okay, now see if we can stuff it in the buffer as-is.
1087 	 */
1088 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
1089 
1090 	tun_type = cp->dest->tun_type;
1091 
1092 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1093 		max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
1094 
1095 	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
1096 	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
1097 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1098 					 &next_protocol, NULL, &dsfield,
1099 					 &ttl, dfp);
1100 	if (IS_ERR(skb))
1101 		goto tx_error;
1102 
1103 	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
1104 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1105 		gso_type |= SKB_GSO_UDP_TUNNEL;
1106 
1107 	if (iptunnel_handle_offloads(skb, gso_type))
1108 		goto tx_error;
1109 
1110 	skb->transport_header = skb->network_header;
1111 
1112 	skb_set_inner_ipproto(skb, next_protocol);
1113 
1114 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1115 		ipvs_gue_encap(net, skb, cp, &next_protocol);
1116 
1117 	skb_push(skb, sizeof(struct iphdr));
1118 	skb_reset_network_header(skb);
1119 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1120 
1121 	/*
1122 	 *	Push down and install the IPIP header.
1123 	 */
1124 	iph			=	ip_hdr(skb);
1125 	iph->version		=	4;
1126 	iph->ihl		=	sizeof(struct iphdr)>>2;
1127 	iph->frag_off		=	df;
1128 	iph->protocol		=	next_protocol;
1129 	iph->tos		=	dsfield;
1130 	iph->daddr		=	cp->daddr.ip;
1131 	iph->saddr		=	saddr;
1132 	iph->ttl		=	ttl;
1133 	ip_select_ident(net, skb, NULL);
1134 
1135 	/* Another hack: avoid icmp_send in ip_fragment */
1136 	skb->ignore_df = 1;
1137 
1138 	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1139 	if (ret == NF_ACCEPT)
1140 		ip_local_out(net, skb->sk, skb);
1141 	else if (ret == NF_DROP)
1142 		kfree_skb(skb);
1143 
1144 	LeaveFunction(10);
1145 
1146 	return NF_STOLEN;
1147 
1148   tx_error:
1149 	if (!IS_ERR(skb))
1150 		kfree_skb(skb);
1151 	LeaveFunction(10);
1152 	return NF_STOLEN;
1153 }
1154 
1155 #ifdef CONFIG_IP_VS_IPV6
1156 int
1157 ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1158 		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1159 {
1160 	struct netns_ipvs *ipvs = cp->ipvs;
1161 	struct net *net = ipvs->net;
1162 	struct rt6_info *rt;		/* Route to the other host */
1163 	struct in6_addr saddr;		/* Source for tunnel */
1164 	struct net_device *tdev;	/* Device to other host */
1165 	__u8 next_protocol = 0;
1166 	__u32 payload_len = 0;
1167 	__u8 dsfield = 0;
1168 	__u8 ttl = 0;
1169 	struct ipv6hdr  *iph;		/* Our new IP header */
1170 	unsigned int max_headroom;	/* The extra header space needed */
1171 	int ret, local;
1172 	int tun_type, gso_type;
1173 
1174 	EnterFunction(10);
1175 
1176 	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
1177 				      &cp->daddr.in6,
1178 				      &saddr, ipvsh, 1,
1179 				      IP_VS_RT_MODE_LOCAL |
1180 				      IP_VS_RT_MODE_NON_LOCAL |
1181 				      IP_VS_RT_MODE_TUNNEL);
1182 	if (local < 0)
1183 		goto tx_error;
1184 	if (local)
1185 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1186 
1187 	rt = (struct rt6_info *) skb_dst(skb);
1188 	tdev = rt->dst.dev;
1189 
1190 	/*
1191 	 * Okay, now see if we can stuff it in the buffer as-is.
1192 	 */
1193 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
1194 
1195 	tun_type = cp->dest->tun_type;
1196 
1197 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1198 		max_headroom += sizeof(struct udphdr) + sizeof(struct guehdr);
1199 
1200 	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
1201 					 &next_protocol, &payload_len,
1202 					 &dsfield, &ttl, NULL);
1203 	if (IS_ERR(skb))
1204 		goto tx_error;
1205 
1206 	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
1207 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1208 		gso_type |= SKB_GSO_UDP_TUNNEL;
1209 
1210 	if (iptunnel_handle_offloads(skb, gso_type))
1211 		goto tx_error;
1212 
1213 	skb->transport_header = skb->network_header;
1214 
1215 	skb_set_inner_ipproto(skb, next_protocol);
1216 
1217 	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE)
1218 		ipvs_gue_encap(net, skb, cp, &next_protocol);
1219 
1220 	skb_push(skb, sizeof(struct ipv6hdr));
1221 	skb_reset_network_header(skb);
1222 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1223 
1224 	/*
1225 	 *	Push down and install the IPIP header.
1226 	 *	Push down and install the outer IPv6 header.
1227 	iph			=	ipv6_hdr(skb);
1228 	iph->version		=	6;
1229 	iph->nexthdr		=	next_protocol;
1230 	iph->payload_len	=	htons(payload_len);
1231 	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
1232 	ipv6_change_dsfield(iph, 0, dsfield);
1233 	iph->daddr = cp->daddr.in6;
1234 	iph->saddr = saddr;
1235 	iph->hop_limit		=	ttl;
1236 
1237 	/* Another hack: avoid icmp_send in ip_fragment */
1238 	skb->ignore_df = 1;
1239 
1240 	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
1241 	if (ret == NF_ACCEPT)
1242 		ip6_local_out(net, skb->sk, skb);
1243 	else if (ret == NF_DROP)
1244 		kfree_skb(skb);
1245 
1246 	LeaveFunction(10);
1247 
1248 	return NF_STOLEN;
1249 
1250 tx_error:
1251 	if (!IS_ERR(skb))
1252 		kfree_skb(skb);
1253 	LeaveFunction(10);
1254 	return NF_STOLEN;
1255 }
1256 #endif
1257 
1258 
1259 /*
1260  *      Direct Routing transmitter
1261  *      Used for ANY protocol
1262  */
1263 int
1264 ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1265 	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1266 {
1267 	int local;
1268 
1269 	EnterFunction(10);
1270 
1271 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
1272 				   IP_VS_RT_MODE_LOCAL |
1273 				   IP_VS_RT_MODE_NON_LOCAL |
1274 				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
1275 	if (local < 0)
1276 		goto tx_error;
1277 	if (local)
1278 		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
1279 
1280 	ip_send_check(ip_hdr(skb));
1281 
1282 	/* Another hack: avoid icmp_send in ip_fragment */
1283 	skb->ignore_df = 1;
1284 
1285 	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
1286 
1287 	LeaveFunction(10);
1288 	return NF_STOLEN;
1289 
1290   tx_error:
1291 	kfree_skb(skb);
1292 	LeaveFunction(10);
1293 	return NF_STOLEN;
1294 }
1295 
1296 #ifdef CONFIG_IP_VS_IPV6
1297 int
1298 ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1299 		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
1300 {
1301 	int local;
1302 
1303 	EnterFunction(10);
1304 
1305 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1306 				      &cp->daddr.in6,
1307 				      NULL, ipvsh, 0,
1308 				      IP_VS_RT_MODE_LOCAL |
1309 				      IP_VS_RT_MODE_NON_LOCAL |
1310 				      IP_VS_RT_MODE_KNOWN_NH);
1311 	if (local < 0)
1312 		goto tx_error;
1313 	if (local)
1314 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
1315 
1316 	/* Another hack: avoid icmp_send in ip_fragment */
1317 	skb->ignore_df = 1;
1318 
1319 	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
1320 
1321 	LeaveFunction(10);
1322 	return NF_STOLEN;
1323 
1324 tx_error:
1325 	kfree_skb(skb);
1326 	LeaveFunction(10);
1327 	return NF_STOLEN;
1328 }
1329 #endif
1330 
1331 
1332 /*
1333  *	ICMP packet transmitter
1334  *	called by the ip_vs_in_icmp
1335  */
1336 int
1337 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1338 		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1339 		struct ip_vs_iphdr *iph)
1340 {
1341 	struct rtable	*rt;	/* Route to the other host */
1342 	int rc;
1343 	int local;
1344 	int rt_mode, was_input;
1345 
1346 	EnterFunction(10);
1347 
1348 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1349 	   forwarded directly here, because there is no need to
1350 	   translate address/port back */
1351 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1352 		if (cp->packet_xmit)
1353 			rc = cp->packet_xmit(skb, cp, pp, iph);
1354 		else
1355 			rc = NF_ACCEPT;
1356 		/* do not touch skb anymore */
1357 		atomic_inc(&cp->in_pkts);
1358 		goto out;
1359 	}
1360 
1361 	/*
1362 	 * mangle and send the packet here (only for VS/NAT)
1363 	 */
1364 	was_input = rt_is_input_route(skb_rtable(skb));
1365 
1366 	/* LOCALNODE from FORWARD hook is not supported */
1367 	rt_mode = (hooknum != NF_INET_FORWARD) ?
1368 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1369 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1370 	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
1371 				   NULL, iph);
1372 	if (local < 0)
1373 		goto tx_error;
1374 	rt = skb_rtable(skb);
1375 
1376 	/*
1377 	 * Avoid duplicate tuple in reply direction for NAT traffic
1378 	 * to local address when connection is sync-ed
1379 	 */
1380 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1381 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1382 		enum ip_conntrack_info ctinfo;
1383 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1384 
1385 		if (ct) {
1386 			IP_VS_DBG(10, "%s(): "
1387 				  "stopping DNAT to local address %pI4\n",
1388 				  __func__, &cp->daddr.ip);
1389 			goto tx_error;
1390 		}
1391 	}
1392 #endif
1393 
1394 	/* From world but DNAT to loopback address? */
1395 	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
1396 		IP_VS_DBG(1, "%s(): "
1397 			  "stopping DNAT to loopback %pI4\n",
1398 			  __func__, &cp->daddr.ip);
1399 		goto tx_error;
1400 	}
1401 
1402 	/* copy-on-write the packet before mangling it */
1403 	if (!skb_make_writable(skb, offset))
1404 		goto tx_error;
1405 
1406 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
1407 		goto tx_error;
1408 
1409 	ip_vs_nat_icmp(skb, pp, cp, 0);
1410 
1411 	/* Another hack: avoid icmp_send in ip_fragment */
1412 	skb->ignore_df = 1;
1413 
1414 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
1415 	goto out;
1416 
1417   tx_error:
1418 	kfree_skb(skb);
1419 	rc = NF_STOLEN;
1420   out:
1421 	LeaveFunction(10);
1422 	return rc;
1423 }
1424 
1425 #ifdef CONFIG_IP_VS_IPV6
1426 int
1427 ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1428 		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
1429 		struct ip_vs_iphdr *ipvsh)
1430 {
1431 	struct rt6_info	*rt;	/* Route to the other host */
1432 	int rc;
1433 	int local;
1434 	int rt_mode;
1435 
1436 	EnterFunction(10);
1437 
1438 	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
1439 	   forwarded directly here, because there is no need to
1440 	   translate address/port back */
1441 	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
1442 		if (cp->packet_xmit)
1443 			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
1444 		else
1445 			rc = NF_ACCEPT;
1446 		/* do not touch skb anymore */
1447 		atomic_inc(&cp->in_pkts);
1448 		goto out;
1449 	}
1450 
1451 	/*
1452 	 * mangle and send the packet here (only for VS/NAT)
1453 	 */
1454 
1455 	/* LOCALNODE from FORWARD hook is not supported */
1456 	rt_mode = (hooknum != NF_INET_FORWARD) ?
1457 		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
1458 		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
1459 	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
1460 				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
1461 	if (local < 0)
1462 		goto tx_error;
1463 	rt = (struct rt6_info *) skb_dst(skb);
1464 	/*
1465 	 * Avoid duplicate tuple in reply direction for NAT traffic
1466 	 * to local address when connection is sync-ed
1467 	 */
1468 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
1469 	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
1470 		enum ip_conntrack_info ctinfo;
1471 		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
1472 
1473 		if (ct) {
1474 			IP_VS_DBG(10, "%s(): "
1475 				  "stopping DNAT to local address %pI6\n",
1476 				  __func__, &cp->daddr.in6);
1477 			goto tx_error;
1478 		}
1479 	}
1480 #endif
1481 
1482 	/* From world but DNAT to loopback address? */
1483 	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
1484 	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
1485 		IP_VS_DBG(1, "%s(): "
1486 			  "stopping DNAT to loopback %pI6\n",
1487 			  __func__, &cp->daddr.in6);
1488 		goto tx_error;
1489 	}
1490 
1491 	/* copy-on-write the packet before mangling it */
1492 	if (!skb_make_writable(skb, offset))
1493 		goto tx_error;
1494 
1495 	if (skb_cow(skb, rt->dst.dev->hard_header_len))
1496 		goto tx_error;
1497 
1498 	ip_vs_nat_icmp_v6(skb, pp, cp, 0);
1499 
1500 	/* Another hack: avoid icmp_send in ip_fragment */
1501 	skb->ignore_df = 1;
1502 
1503 	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
1504 	goto out;
1505 
1506 tx_error:
1507 	kfree_skb(skb);
1508 	rc = NF_STOLEN;
1509 out:
1510 	LeaveFunction(10);
1511 	return rc;
1512 }
1513 #endif
1514