// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
 * - not all connections have destination server, for example,
 *   connections in backup server when fwmark is used
 * - bypass connections use daddr from packet
 * - we can use dst without ref while sending in RCU section, we use
 *   ref when returning NF_ACCEPT for NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>                  /* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}
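
/* RCU callback paired with __ip_vs_dst_set() below: release the old cached
 * route only after all RCU readers have stopped using it.
 */
static void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
{
	struct ip_vs_dest_dst *dest_dst = container_of(head,
						       struct ip_vs_dest_dst,
						       rcu_head);

	dst_release(dest_dst->dst_cache);
	kfree(dest_dst);
}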

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}

static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (dst->obsolete &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}

static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true; /* largest fragment violates the MTU */
	} else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true; /* packet size violates the MTU */
	}
	return false;
}

/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;
	bool loop = false;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		/* Invalid saddr ? */
		if (PTR_ERR(rt) == -EINVAL && *saddr &&
		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
			*saddr = 0;
			flowi4_update_output(&fl4, 0, 0, daddr, 0);
			goto retry;
		}
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		*saddr = fl4.saddr;
		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
		loop = true;
		goto retry;
	}
	*saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif

static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			(struct rt6_info *)skb_dst(skb));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}

static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
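
/* Return false when the packet does not fit the given route MTU, sending the
 * relevant "packet too big" / "fragmentation needed" ICMP error where
 * appropriate; return true when it fits or may simply be fragmented.
 */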
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}

	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rtable *) dest_dst->dst_cache;
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		__be32 saddr = htonl(INADDR_ANY);

		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, &saddr);
		if (!rt)
			goto err_unreach;
		if (ret_saddr)
			*ret_saddr = saddr;
	}
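
	/* RTCF_LOCAL means the selected destination is an address configured
	 * on this host, so the packet would be delivered to the local stack.
	 */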
	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}

#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}
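
/* IPv6 counterpart of __ip_vs_get_out_rt(): the cached route is validated
 * with an rt6 cookie, and the source address may come from IPv6 source
 * address selection or an optional xfrm lookup in __ip_vs_route_output_v6().
 */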
/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rt6_info *) dest_dst->dst_cache;
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = (struct rt6_info *) dst;
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = (struct rt6_info *) dst;
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif


/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb->tstamp = 0;
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et. al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		if (skb->dev)
			skb->tstamp = 0;
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		if (skb->dev)
			skb->tstamp = 0;
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}


/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
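
/* Bypass connections carry no real server (cp->dest is NULL), so the route
 * below is resolved from the packet's own destination address.
 */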

/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it may only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr  *iph = ip_hdr(skb);

	EnterFunction(10);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	EnterFunction(10);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	EnterFunction(10);

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	   length is larger than the MTU of the outgoing device, there will
	   still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	LeaveFunction(10);
	return rc;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	EnterFunction(10);

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;
		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	   length is larger than the MTU of the outgoing device, there will
	   still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	LeaveFunction(10);
	return rc;

tx_error:
	LeaveFunction(10);
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = ntohs(old_iph->tot_len);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}

static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}

static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr  *udph;	/* Our new UDP header */
	struct guehdr  *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;
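
		/* Remote checksum offload: add a private option word carrying
		 * GUE_PFLAG_REMCSUM plus the offsets telling the receiver
		 * where the inner checksum starts and where it is stored.
		 */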
		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}

static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
		htons(ETH_P_IP) : htons(ETH_P_IPV6);
	__be16 tflags = 0;
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		tflags |= TUNNEL_CSUM;

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, whose
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in a VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	EnterFunction(10);

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
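	/* On top of the outer IPv4 header, GUE needs room for a UDP + GUE
	 * header (plus remote-checksum private data when used) and GRE for
	 * its calculated header length.
	 */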
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		goto tx_error;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
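	/* The outer header carries the DS field (ECN-encapsulated), TTL and
	 * DF bit collected from the inner packet by
	 * ip_vs_prepare_tunneled_skb().
	 */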
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr)>>2;
	iph->frag_off = df;
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	LeaveFunction(10);

	return NF_STOLEN;

tx_error:
	if (!IS_ERR(skb))
		kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr  *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	EnterFunction(10);

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = (struct rt6_info *) skb_dst(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		goto tx_error;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 * Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	LeaveFunction(10);

	return NF_STOLEN;

tx_error:
	if (!IS_ERR(skb))
		kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif


/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	EnterFunction(10);

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	EnterFunction(10);

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif


/*
 *      ICMP packet transmitter
 *      called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
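	/* From FORWARD only non-local real servers are allowed; redirecting
	 * to a local address is possible only from LOCAL_IN/LOCAL_OUT.
	 */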
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
	goto out;

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
	goto out;

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}
#endif