// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Make sure idev stays alive */
		rcu_read_lock();
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			rcu_read_unlock();
			return -ENOMEM;
		}
		rcu_read_unlock();
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
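			   The clone queued below is looped back through
			   dev_loopback_xmit(), so a local listener that has
			   joined the group also receives its own transmission.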
95 */ 96 if (newskb) 97 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 98 net, sk, newskb, NULL, newskb->dev, 99 dev_loopback_xmit); 100 101 if (hdr->hop_limit == 0) { 102 IP6_INC_STATS(net, idev, 103 IPSTATS_MIB_OUTDISCARDS); 104 kfree_skb(skb); 105 return 0; 106 } 107 } 108 109 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 110 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && 111 !(dev->flags & IFF_LOOPBACK)) { 112 kfree_skb(skb); 113 return 0; 114 } 115 } 116 117 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 118 int res = lwtunnel_xmit(skb); 119 120 if (res != LWTUNNEL_XMIT_CONTINUE) 121 return res; 122 } 123 124 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); 125 126 rcu_read_lock(); 127 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); 128 neigh = __ipv6_neigh_lookup_noref(dev, nexthop); 129 130 if (unlikely(IS_ERR_OR_NULL(neigh))) { 131 if (unlikely(!neigh)) 132 neigh = __neigh_create(&nd_tbl, nexthop, dev, false); 133 if (IS_ERR(neigh)) { 134 rcu_read_unlock(); 135 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); 136 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); 137 return -EINVAL; 138 } 139 } 140 sock_confirm_neigh(skb, neigh); 141 ret = neigh_output(neigh, skb, false); 142 rcu_read_unlock(); 143 return ret; 144 } 145 146 static int 147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, 148 struct sk_buff *skb, unsigned int mtu) 149 { 150 struct sk_buff *segs, *nskb; 151 netdev_features_t features; 152 int ret = 0; 153 154 /* Please see corresponding comment in ip_finish_output_gso 155 * describing the cases where GSO segment length exceeds the 156 * egress MTU. 157 */ 158 features = netif_skb_features(skb); 159 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 160 if (IS_ERR_OR_NULL(segs)) { 161 kfree_skb(skb); 162 return -ENOMEM; 163 } 164 165 consume_skb(skb); 166 167 skb_list_walk_safe(segs, segs, nskb) { 168 int err; 169 170 skb_mark_not_on_list(segs); 171 /* Last GSO segment can be smaller than gso_size (and MTU). 172 * Adding a fragment header would produce an "atomic fragment", 173 * which is considered harmful (RFC-8021). Avoid that. 174 */ 175 err = segs->len > mtu ? 176 ip6_fragment(net, sk, segs, ip6_finish_output2) : 177 ip6_finish_output2(net, sk, segs); 178 if (err && ret == 0) 179 ret = err; 180 } 181 182 return ret; 183 } 184 185 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 186 { 187 unsigned int mtu; 188 189 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 190 /* Policy lookup after SNAT yielded a new policy */ 191 if (skb_dst(skb)->xfrm) { 192 IP6CB(skb)->flags |= IP6SKB_REROUTED; 193 return dst_output(net, sk, skb); 194 } 195 #endif 196 197 mtu = ip6_skb_dst_mtu(skb); 198 if (skb_is_gso(skb) && 199 !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && 200 !skb_gso_validate_network_len(skb, mtu)) 201 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); 202 203 if ((skb->len > mtu && !skb_is_gso(skb)) || 204 dst_allfrag(skb_dst(skb)) || 205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 206 return ip6_fragment(net, sk, skb, ip6_finish_output2); 207 else 208 return ip6_finish_output2(net, sk, skb); 209 } 210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? 
: ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; 229 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 230 231 skb->protocol = htons(ETH_P_IPV6); 232 skb->dev = dev; 233 234 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 235 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 236 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 237 return 0; 238 } 239 240 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 241 net, sk, skb, indev, dev, 242 ip6_finish_output, 243 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 244 } 245 EXPORT_SYMBOL(ip6_output); 246 247 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) 248 { 249 if (!np->autoflowlabel_set) 250 return ip6_default_np_autolabel(net); 251 else 252 return np->autoflowlabel; 253 } 254 255 /* 256 * xmit an sk_buff (used by TCP, SCTP and DCCP) 257 * Note : socket lock is not held for SYNACK packets, but might be modified 258 * by calls to skb_set_owner_w() and ipv6_local_error(), 259 * which are using proper atomic operations or spinlocks. 260 */ 261 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 262 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) 263 { 264 struct net *net = sock_net(sk); 265 const struct ipv6_pinfo *np = inet6_sk(sk); 266 struct in6_addr *first_hop = &fl6->daddr; 267 struct dst_entry *dst = skb_dst(skb); 268 struct net_device *dev = dst->dev; 269 struct inet6_dev *idev = ip6_dst_idev(dst); 270 struct hop_jumbo_hdr *hop_jumbo; 271 int hoplen = sizeof(*hop_jumbo); 272 unsigned int head_room; 273 struct ipv6hdr *hdr; 274 u8 proto = fl6->flowi6_proto; 275 int seg_len = skb->len; 276 int hlimit = -1; 277 u32 mtu; 278 279 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); 280 if (opt) 281 head_room += opt->opt_nflen + opt->opt_flen; 282 283 if (unlikely(head_room > skb_headroom(skb))) { 284 /* Make sure idev stays alive */ 285 rcu_read_lock(); 286 skb = skb_expand_head(skb, head_room); 287 if (!skb) { 288 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 289 rcu_read_unlock(); 290 return -ENOBUFS; 291 } 292 rcu_read_unlock(); 293 } 294 295 if (opt) { 296 seg_len += opt->opt_nflen + opt->opt_flen; 297 298 if (opt->opt_flen) 299 ipv6_push_frag_opts(skb, opt, &proto); 300 301 if (opt->opt_nflen) 302 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, 303 &fl6->saddr); 304 } 305 306 if (unlikely(seg_len > IPV6_MAXPLEN)) { 307 hop_jumbo = skb_push(skb, hoplen); 308 309 hop_jumbo->nexthdr = proto; 310 hop_jumbo->hdrlen = 0; 311 hop_jumbo->tlv_type = IPV6_TLV_JUMBO; 312 hop_jumbo->tlv_len = 4; 313 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); 314 315 proto = IPPROTO_HOPOPTS; 316 seg_len = 0; 317 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; 318 } 319 320 skb_push(skb, sizeof(struct ipv6hdr)); 321 skb_reset_network_header(skb); 322 hdr = ipv6_hdr(skb); 323 324 /* 325 * Fill in the IPv6 header 326 */ 327 if (np) 328 hlimit = np->hop_limit; 329 if (hlimit < 0) 330 hlimit = ip6_dst_hoplimit(dst); 331 332 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 333 ip6_autoflowlabel(net, np), fl6)); 334 335 hdr->payload_len = htons(seg_len); 336 hdr->nexthdr = proto; 337 hdr->hop_limit = hlimit; 338 339 hdr->saddr = fl6->saddr; 340 hdr->daddr = *first_hop; 341 342 skb->protocol = htons(ETH_P_IPV6); 343 skb->priority = 
priority; 344 skb->mark = mark; 345 346 mtu = dst_mtu(dst); 347 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 348 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); 349 350 /* if egress device is enslaved to an L3 master device pass the 351 * skb to its handler for processing 352 */ 353 skb = l3mdev_ip6_out((struct sock *)sk, skb); 354 if (unlikely(!skb)) 355 return 0; 356 357 /* hooks should never assume socket lock is held. 358 * we promote our socket to non const 359 */ 360 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 361 net, (struct sock *)sk, skb, NULL, dev, 362 dst_output); 363 } 364 365 skb->dev = dev; 366 /* ipv6_local_error() does not require socket lock, 367 * we promote our socket to non const 368 */ 369 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 370 371 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); 372 kfree_skb(skb); 373 return -EMSGSIZE; 374 } 375 EXPORT_SYMBOL(ip6_xmit); 376 377 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 378 { 379 struct ip6_ra_chain *ra; 380 struct sock *last = NULL; 381 382 read_lock(&ip6_ra_lock); 383 for (ra = ip6_ra_chain; ra; ra = ra->next) { 384 struct sock *sk = ra->sk; 385 if (sk && ra->sel == sel && 386 (!sk->sk_bound_dev_if || 387 sk->sk_bound_dev_if == skb->dev->ifindex)) { 388 struct ipv6_pinfo *np = inet6_sk(sk); 389 390 if (np && np->rtalert_isolate && 391 !net_eq(sock_net(sk), dev_net(skb->dev))) { 392 continue; 393 } 394 if (last) { 395 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 396 if (skb2) 397 rawv6_rcv(last, skb2); 398 } 399 last = sk; 400 } 401 } 402 403 if (last) { 404 rawv6_rcv(last, skb); 405 read_unlock(&ip6_ra_lock); 406 return 1; 407 } 408 read_unlock(&ip6_ra_lock); 409 return 0; 410 } 411 412 static int ip6_forward_proxy_check(struct sk_buff *skb) 413 { 414 struct ipv6hdr *hdr = ipv6_hdr(skb); 415 u8 nexthdr = hdr->nexthdr; 416 __be16 frag_off; 417 int offset; 418 419 if (ipv6_ext_hdr(nexthdr)) { 420 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 421 if (offset < 0) 422 return 0; 423 } else 424 offset = sizeof(struct ipv6hdr); 425 426 if (nexthdr == IPPROTO_ICMPV6) { 427 struct icmp6hdr *icmp6; 428 429 if (!pskb_may_pull(skb, (skb_network_header(skb) + 430 offset + 1 - skb->data))) 431 return 0; 432 433 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 434 435 switch (icmp6->icmp6_type) { 436 case NDISC_ROUTER_SOLICITATION: 437 case NDISC_ROUTER_ADVERTISEMENT: 438 case NDISC_NEIGHBOUR_SOLICITATION: 439 case NDISC_NEIGHBOUR_ADVERTISEMENT: 440 case NDISC_REDIRECT: 441 /* For reaction involving unicast neighbor discovery 442 * message destined to the proxied address, pass it to 443 * input function. 444 */ 445 return 1; 446 default: 447 break; 448 } 449 } 450 451 /* 452 * The proxying router can't forward traffic sent to a link-local 453 * address, so signal the sender and discard the packet. This 454 * behavior is clarified by the MIPv6 specification. 
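	 * dst_link_failure() below reports this to the sender with an ICMPv6
	 * destination unreachable before the caller drops the packet.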
455 */ 456 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 457 dst_link_failure(skb); 458 return -1; 459 } 460 461 return 0; 462 } 463 464 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 465 struct sk_buff *skb) 466 { 467 struct dst_entry *dst = skb_dst(skb); 468 469 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 470 471 #ifdef CONFIG_NET_SWITCHDEV 472 if (skb->offload_l3_fwd_mark) { 473 consume_skb(skb); 474 return 0; 475 } 476 #endif 477 478 skb_clear_tstamp(skb); 479 return dst_output(net, sk, skb); 480 } 481 482 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 483 { 484 if (skb->len <= mtu) 485 return false; 486 487 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 488 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 489 return true; 490 491 if (skb->ignore_df) 492 return false; 493 494 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) 495 return false; 496 497 return true; 498 } 499 500 int ip6_forward(struct sk_buff *skb) 501 { 502 struct dst_entry *dst = skb_dst(skb); 503 struct ipv6hdr *hdr = ipv6_hdr(skb); 504 struct inet6_skb_parm *opt = IP6CB(skb); 505 struct net *net = dev_net(dst->dev); 506 struct inet6_dev *idev; 507 SKB_DR(reason); 508 u32 mtu; 509 510 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 511 if (net->ipv6.devconf_all->forwarding == 0) 512 goto error; 513 514 if (skb->pkt_type != PACKET_HOST) 515 goto drop; 516 517 if (unlikely(skb->sk)) 518 goto drop; 519 520 if (skb_warn_if_lro(skb)) 521 goto drop; 522 523 if (!net->ipv6.devconf_all->disable_policy && 524 (!idev || !idev->cnf.disable_policy) && 525 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 526 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 527 goto drop; 528 } 529 530 skb_forward_csum(skb); 531 532 /* 533 * We DO NOT make any processing on 534 * RA packets, pushing them to user level AS IS 535 * without ane WARRANTY that application will be able 536 * to interpret them. The reason is that we 537 * cannot make anything clever here. 538 * 539 * We are not end-node, so that if packet contains 540 * AH/ESP, we cannot make anything. 541 * Defragmentation also would be mistake, RA packets 542 * cannot be fragmented, because there is no warranty 543 * that different fragments will go along one path. --ANK 544 */ 545 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 546 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 547 return 0; 548 } 549 550 /* 551 * check and decrement ttl 552 */ 553 if (hdr->hop_limit <= 1) { 554 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 555 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 556 557 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); 558 return -ETIMEDOUT; 559 } 560 561 /* XXX: idev->cnf.proxy_ndp? */ 562 if (net->ipv6.devconf_all->proxy_ndp && 563 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { 564 int proxied = ip6_forward_proxy_check(skb); 565 if (proxied > 0) { 566 /* It's tempting to decrease the hop limit 567 * here by 1, as we do at the end of the 568 * function too. 569 * 570 * But that would be incorrect, as proxying is 571 * not forwarding. The ip6_input function 572 * will handle this packet locally, and it 573 * depends on the hop limit being unchanged. 574 * 575 * One example is the NDP hop limit, that 576 * always has to stay 255, but other would be 577 * similar checks around RA packets, where the 578 * user can even change the desired limit. 
579 */ 580 return ip6_input(skb); 581 } else if (proxied < 0) { 582 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 583 goto drop; 584 } 585 } 586 587 if (!xfrm6_route_forward(skb)) { 588 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 589 SKB_DR_SET(reason, XFRM_POLICY); 590 goto drop; 591 } 592 dst = skb_dst(skb); 593 594 /* IPv6 specs say nothing about it, but it is clear that we cannot 595 send redirects to source routed frames. 596 We don't send redirects to frames decapsulated from IPsec. 597 */ 598 if (IP6CB(skb)->iif == dst->dev->ifindex && 599 opt->srcrt == 0 && !skb_sec_path(skb)) { 600 struct in6_addr *target = NULL; 601 struct inet_peer *peer; 602 struct rt6_info *rt; 603 604 /* 605 * incoming and outgoing devices are the same 606 * send a redirect. 607 */ 608 609 rt = (struct rt6_info *) dst; 610 if (rt->rt6i_flags & RTF_GATEWAY) 611 target = &rt->rt6i_gateway; 612 else 613 target = &hdr->daddr; 614 615 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 616 617 /* Limit redirects both by destination (here) 618 and by source (inside ndisc_send_redirect) 619 */ 620 if (inet_peer_xrlim_allow(peer, 1*HZ)) 621 ndisc_send_redirect(skb, target); 622 if (peer) 623 inet_putpeer(peer); 624 } else { 625 int addrtype = ipv6_addr_type(&hdr->saddr); 626 627 /* This check is security critical. */ 628 if (addrtype == IPV6_ADDR_ANY || 629 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 630 goto error; 631 if (addrtype & IPV6_ADDR_LINKLOCAL) { 632 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 633 ICMPV6_NOT_NEIGHBOUR, 0); 634 goto error; 635 } 636 } 637 638 mtu = ip6_dst_mtu_maybe_forward(dst, true); 639 if (mtu < IPV6_MIN_MTU) 640 mtu = IPV6_MIN_MTU; 641 642 if (ip6_pkt_too_big(skb, mtu)) { 643 /* Again, force OUTPUT device used as source address */ 644 skb->dev = dst->dev; 645 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 646 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 647 __IP6_INC_STATS(net, ip6_dst_idev(dst), 648 IPSTATS_MIB_FRAGFAILS); 649 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 650 return -EMSGSIZE; 651 } 652 653 if (skb_cow(skb, dst->dev->hard_header_len)) { 654 __IP6_INC_STATS(net, ip6_dst_idev(dst), 655 IPSTATS_MIB_OUTDISCARDS); 656 goto drop; 657 } 658 659 hdr = ipv6_hdr(skb); 660 661 /* Mangling hops number delayed to point after skb COW */ 662 663 hdr->hop_limit--; 664 665 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 666 net, NULL, skb, skb->dev, dst->dev, 667 ip6_forward_finish); 668 669 error: 670 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 671 SKB_DR_SET(reason, IP_INADDRERRORS); 672 drop: 673 kfree_skb_reason(skb, reason); 674 return -EINVAL; 675 } 676 677 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 678 { 679 to->pkt_type = from->pkt_type; 680 to->priority = from->priority; 681 to->protocol = from->protocol; 682 skb_dst_drop(to); 683 skb_dst_set(to, dst_clone(skb_dst(from))); 684 to->dev = from->dev; 685 to->mark = from->mark; 686 687 skb_copy_hash(to, from); 688 689 #ifdef CONFIG_NET_SCHED 690 to->tc_index = from->tc_index; 691 #endif 692 nf_copy(to, from); 693 skb_ext_copy(to, from); 694 skb_copy_secmark(to, from); 695 } 696 697 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, 698 u8 nexthdr, __be32 frag_id, 699 struct ip6_fraglist_iter *iter) 700 { 701 unsigned int first_len; 702 struct frag_hdr *fh; 703 704 /* BUILD HEADER */ 705 *prevhdr = NEXTHDR_FRAGMENT; 706 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 707 if (!iter->tmp_hdr) 708 return -ENOMEM; 
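	/* Detach the frag_list: each entry will be sent as its own fragment. */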
709 710 iter->frag = skb_shinfo(skb)->frag_list; 711 skb_frag_list_init(skb); 712 713 iter->offset = 0; 714 iter->hlen = hlen; 715 iter->frag_id = frag_id; 716 iter->nexthdr = nexthdr; 717 718 __skb_pull(skb, hlen); 719 fh = __skb_push(skb, sizeof(struct frag_hdr)); 720 __skb_push(skb, hlen); 721 skb_reset_network_header(skb); 722 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); 723 724 fh->nexthdr = nexthdr; 725 fh->reserved = 0; 726 fh->frag_off = htons(IP6_MF); 727 fh->identification = frag_id; 728 729 first_len = skb_pagelen(skb); 730 skb->data_len = first_len - skb_headlen(skb); 731 skb->len = first_len; 732 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); 733 734 return 0; 735 } 736 EXPORT_SYMBOL(ip6_fraglist_init); 737 738 void ip6_fraglist_prepare(struct sk_buff *skb, 739 struct ip6_fraglist_iter *iter) 740 { 741 struct sk_buff *frag = iter->frag; 742 unsigned int hlen = iter->hlen; 743 struct frag_hdr *fh; 744 745 frag->ip_summed = CHECKSUM_NONE; 746 skb_reset_transport_header(frag); 747 fh = __skb_push(frag, sizeof(struct frag_hdr)); 748 __skb_push(frag, hlen); 749 skb_reset_network_header(frag); 750 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); 751 iter->offset += skb->len - hlen - sizeof(struct frag_hdr); 752 fh->nexthdr = iter->nexthdr; 753 fh->reserved = 0; 754 fh->frag_off = htons(iter->offset); 755 if (frag->next) 756 fh->frag_off |= htons(IP6_MF); 757 fh->identification = iter->frag_id; 758 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 759 ip6_copy_metadata(frag, skb); 760 } 761 EXPORT_SYMBOL(ip6_fraglist_prepare); 762 763 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, 764 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, 765 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) 766 { 767 state->prevhdr = prevhdr; 768 state->nexthdr = nexthdr; 769 state->frag_id = frag_id; 770 771 state->hlen = hlen; 772 state->mtu = mtu; 773 774 state->left = skb->len - hlen; /* Space per frame */ 775 state->ptr = hlen; /* Where to start from */ 776 777 state->hroom = hdr_room; 778 state->troom = needed_tailroom; 779 780 state->offset = 0; 781 } 782 EXPORT_SYMBOL(ip6_frag_init); 783 784 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) 785 { 786 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; 787 struct sk_buff *frag; 788 struct frag_hdr *fh; 789 unsigned int len; 790 791 len = state->left; 792 /* IF: it doesn't fit, use 'mtu' - the data space left */ 793 if (len > state->mtu) 794 len = state->mtu; 795 /* IF: we are not sending up to and including the packet end 796 then align the next start on an eight byte boundary */ 797 if (len < state->left) 798 len &= ~7; 799 800 /* Allocate buffer */ 801 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + 802 state->hroom + state->troom, GFP_ATOMIC); 803 if (!frag) 804 return ERR_PTR(-ENOMEM); 805 806 /* 807 * Set up data on packet 808 */ 809 810 ip6_copy_metadata(frag, skb); 811 skb_reserve(frag, state->hroom); 812 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); 813 skb_reset_network_header(frag); 814 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); 815 frag->transport_header = (frag->network_header + state->hlen + 816 sizeof(struct frag_hdr)); 817 818 /* 819 * Charge the memory for the fragment to any owner 820 * it might possess 821 */ 822 if (skb->sk) 823 skb_set_owner_w(frag, skb->sk); 824 825 /* 826 * Copy the packet header into the new buffer. 
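	 *	(i.e. the first state->hlen bytes, the unfragmentable part).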
827 */ 828 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); 829 830 fragnexthdr_offset = skb_network_header(frag); 831 fragnexthdr_offset += prevhdr - skb_network_header(skb); 832 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 833 834 /* 835 * Build fragment header. 836 */ 837 fh->nexthdr = state->nexthdr; 838 fh->reserved = 0; 839 fh->identification = state->frag_id; 840 841 /* 842 * Copy a block of the IP datagram. 843 */ 844 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), 845 len)); 846 state->left -= len; 847 848 fh->frag_off = htons(state->offset); 849 if (state->left > 0) 850 fh->frag_off |= htons(IP6_MF); 851 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 852 853 state->ptr += len; 854 state->offset += len; 855 856 return frag; 857 } 858 EXPORT_SYMBOL(ip6_frag_next); 859 860 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 861 int (*output)(struct net *, struct sock *, struct sk_buff *)) 862 { 863 struct sk_buff *frag; 864 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 865 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 866 inet6_sk(skb->sk) : NULL; 867 bool mono_delivery_time = skb->mono_delivery_time; 868 struct ip6_frag_state state; 869 unsigned int mtu, hlen, nexthdr_offset; 870 ktime_t tstamp = skb->tstamp; 871 int hroom, err = 0; 872 __be32 frag_id; 873 u8 *prevhdr, nexthdr = 0; 874 875 err = ip6_find_1stfragopt(skb, &prevhdr); 876 if (err < 0) 877 goto fail; 878 hlen = err; 879 nexthdr = *prevhdr; 880 nexthdr_offset = prevhdr - skb_network_header(skb); 881 882 mtu = ip6_skb_dst_mtu(skb); 883 884 /* We must not fragment if the socket is set to force MTU discovery 885 * or if the skb it not generated by a local socket. 886 */ 887 if (unlikely(!skb->ignore_df && skb->len > mtu)) 888 goto fail_toobig; 889 890 if (IP6CB(skb)->frag_max_size) { 891 if (IP6CB(skb)->frag_max_size > mtu) 892 goto fail_toobig; 893 894 /* don't send fragments larger than what we received */ 895 mtu = IP6CB(skb)->frag_max_size; 896 if (mtu < IPV6_MIN_MTU) 897 mtu = IPV6_MIN_MTU; 898 } 899 900 if (np && np->frag_size < mtu) { 901 if (np->frag_size) 902 mtu = np->frag_size; 903 } 904 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 905 goto fail_toobig; 906 mtu -= hlen + sizeof(struct frag_hdr); 907 908 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 909 &ipv6_hdr(skb)->saddr); 910 911 if (skb->ip_summed == CHECKSUM_PARTIAL && 912 (err = skb_checksum_help(skb))) 913 goto fail; 914 915 prevhdr = skb_network_header(skb) + nexthdr_offset; 916 hroom = LL_RESERVED_SPACE(rt->dst.dev); 917 if (skb_has_frag_list(skb)) { 918 unsigned int first_len = skb_pagelen(skb); 919 struct ip6_fraglist_iter iter; 920 struct sk_buff *frag2; 921 922 if (first_len - hlen > mtu || 923 ((first_len - hlen) & 7) || 924 skb_cloned(skb) || 925 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 926 goto slow_path; 927 928 skb_walk_frags(skb, frag) { 929 /* Correct geometry. */ 930 if (frag->len > mtu || 931 ((frag->len & 7) && frag->next) || 932 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 933 goto slow_path_clean; 934 935 /* Partially cloned skb? */ 936 if (skb_shared(frag)) 937 goto slow_path_clean; 938 939 BUG_ON(frag->sk); 940 if (skb->sk) { 941 frag->sk = skb->sk; 942 frag->destructor = sock_wfree; 943 } 944 skb->truesize -= frag->truesize; 945 } 946 947 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, 948 &iter); 949 if (err < 0) 950 goto fail; 951 952 /* We prevent @rt from being freed. 
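		 * and keep the idev used for the FRAGCREATES/FRAGOKS/FRAGFAILS
		 * counters alive while the fragment chain is transmitted.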
*/ 953 rcu_read_lock(); 954 955 for (;;) { 956 /* Prepare header of the next frame, 957 * before previous one went down. */ 958 if (iter.frag) 959 ip6_fraglist_prepare(skb, &iter); 960 961 skb_set_delivery_time(skb, tstamp, mono_delivery_time); 962 err = output(net, sk, skb); 963 if (!err) 964 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 965 IPSTATS_MIB_FRAGCREATES); 966 967 if (err || !iter.frag) 968 break; 969 970 skb = ip6_fraglist_next(&iter); 971 } 972 973 kfree(iter.tmp_hdr); 974 975 if (err == 0) { 976 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 977 IPSTATS_MIB_FRAGOKS); 978 rcu_read_unlock(); 979 return 0; 980 } 981 982 kfree_skb_list(iter.frag); 983 984 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 985 IPSTATS_MIB_FRAGFAILS); 986 rcu_read_unlock(); 987 return err; 988 989 slow_path_clean: 990 skb_walk_frags(skb, frag2) { 991 if (frag2 == frag) 992 break; 993 frag2->sk = NULL; 994 frag2->destructor = NULL; 995 skb->truesize += frag2->truesize; 996 } 997 } 998 999 slow_path: 1000 /* 1001 * Fragment the datagram. 1002 */ 1003 1004 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, 1005 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, 1006 &state); 1007 1008 /* 1009 * Keep copying data until we run out. 1010 */ 1011 1012 while (state.left > 0) { 1013 frag = ip6_frag_next(skb, &state); 1014 if (IS_ERR(frag)) { 1015 err = PTR_ERR(frag); 1016 goto fail; 1017 } 1018 1019 /* 1020 * Put this fragment into the sending queue. 1021 */ 1022 skb_set_delivery_time(frag, tstamp, mono_delivery_time); 1023 err = output(net, sk, frag); 1024 if (err) 1025 goto fail; 1026 1027 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1028 IPSTATS_MIB_FRAGCREATES); 1029 } 1030 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1031 IPSTATS_MIB_FRAGOKS); 1032 consume_skb(skb); 1033 return err; 1034 1035 fail_toobig: 1036 if (skb->sk && dst_allfrag(skb_dst(skb))) 1037 sk_gso_disable(skb->sk); 1038 1039 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1040 err = -EMSGSIZE; 1041 1042 fail: 1043 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1044 IPSTATS_MIB_FRAGFAILS); 1045 kfree_skb(skb); 1046 return err; 1047 } 1048 1049 static inline int ip6_rt_check(const struct rt6key *rt_key, 1050 const struct in6_addr *fl_addr, 1051 const struct in6_addr *addr_cache) 1052 { 1053 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 1054 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 1055 } 1056 1057 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 1058 struct dst_entry *dst, 1059 const struct flowi6 *fl6) 1060 { 1061 struct ipv6_pinfo *np = inet6_sk(sk); 1062 struct rt6_info *rt; 1063 1064 if (!dst) 1065 goto out; 1066 1067 if (dst->ops->family != AF_INET6) { 1068 dst_release(dst); 1069 return NULL; 1070 } 1071 1072 rt = (struct rt6_info *)dst; 1073 /* Yes, checking route validity in not connected 1074 * case is not very simple. Take into account, 1075 * that we do not support routing by source, TOS, 1076 * and MSG_DONTROUTE --ANK (980726) 1077 * 1078 * 1. ip6_rt_check(): If route was host route, 1079 * check that cached destination is current. 1080 * If it is network route, we still may 1081 * check its validity using saved pointer 1082 * to the last used address: daddr_cache. 1083 * We do not want to save whole address now, 1084 * (because main consumer of this service 1085 * is tcp, which has not this problem), 1086 * so that the last trick works only on connected 1087 * sockets. 1088 * 2. oif also should be the same. 
1089 */ 1090 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 1091 #ifdef CONFIG_IPV6_SUBTREES 1092 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 1093 #endif 1094 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { 1095 dst_release(dst); 1096 dst = NULL; 1097 } 1098 1099 out: 1100 return dst; 1101 } 1102 1103 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 1104 struct dst_entry **dst, struct flowi6 *fl6) 1105 { 1106 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1107 struct neighbour *n; 1108 struct rt6_info *rt; 1109 #endif 1110 int err; 1111 int flags = 0; 1112 1113 /* The correct way to handle this would be to do 1114 * ip6_route_get_saddr, and then ip6_route_output; however, 1115 * the route-specific preferred source forces the 1116 * ip6_route_output call _before_ ip6_route_get_saddr. 1117 * 1118 * In source specific routing (no src=any default route), 1119 * ip6_route_output will fail given src=any saddr, though, so 1120 * that's why we try it again later. 1121 */ 1122 if (ipv6_addr_any(&fl6->saddr)) { 1123 struct fib6_info *from; 1124 struct rt6_info *rt; 1125 1126 *dst = ip6_route_output(net, sk, fl6); 1127 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 1128 1129 rcu_read_lock(); 1130 from = rt ? rcu_dereference(rt->from) : NULL; 1131 err = ip6_route_get_saddr(net, from, &fl6->daddr, 1132 sk ? inet6_sk(sk)->srcprefs : 0, 1133 fl6->flowi6_l3mdev, 1134 &fl6->saddr); 1135 rcu_read_unlock(); 1136 1137 if (err) 1138 goto out_err_release; 1139 1140 /* If we had an erroneous initial result, pretend it 1141 * never existed and let the SA-enabled version take 1142 * over. 1143 */ 1144 if ((*dst)->error) { 1145 dst_release(*dst); 1146 *dst = NULL; 1147 } 1148 1149 if (fl6->flowi6_oif) 1150 flags |= RT6_LOOKUP_F_IFACE; 1151 } 1152 1153 if (!*dst) 1154 *dst = ip6_route_output_flags(net, sk, fl6, flags); 1155 1156 err = (*dst)->error; 1157 if (err) 1158 goto out_err_release; 1159 1160 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1161 /* 1162 * Here if the dst entry we've looked up 1163 * has a neighbour entry that is in the INCOMPLETE 1164 * state and the src address from the flow is 1165 * marked as OPTIMISTIC, we release the found 1166 * dst entry and replace it instead with the 1167 * dst entry of the nexthop router 1168 */ 1169 rt = (struct rt6_info *) *dst; 1170 rcu_read_lock(); 1171 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 1172 rt6_nexthop(rt, &fl6->daddr)); 1173 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? 
-EINVAL : 0; 1174 rcu_read_unlock(); 1175 1176 if (err) { 1177 struct inet6_ifaddr *ifp; 1178 struct flowi6 fl_gw6; 1179 int redirect; 1180 1181 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 1182 (*dst)->dev, 1); 1183 1184 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 1185 if (ifp) 1186 in6_ifa_put(ifp); 1187 1188 if (redirect) { 1189 /* 1190 * We need to get the dst entry for the 1191 * default router instead 1192 */ 1193 dst_release(*dst); 1194 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1195 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1196 *dst = ip6_route_output(net, sk, &fl_gw6); 1197 err = (*dst)->error; 1198 if (err) 1199 goto out_err_release; 1200 } 1201 } 1202 #endif 1203 if (ipv6_addr_v4mapped(&fl6->saddr) && 1204 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1205 err = -EAFNOSUPPORT; 1206 goto out_err_release; 1207 } 1208 1209 return 0; 1210 1211 out_err_release: 1212 dst_release(*dst); 1213 *dst = NULL; 1214 1215 if (err == -ENETUNREACH) 1216 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1217 return err; 1218 } 1219 1220 /** 1221 * ip6_dst_lookup - perform route lookup on flow 1222 * @net: Network namespace to perform lookup in 1223 * @sk: socket which provides route info 1224 * @dst: pointer to dst_entry * for result 1225 * @fl6: flow to lookup 1226 * 1227 * This function performs a route lookup on the given flow. 1228 * 1229 * It returns zero on success, or a standard errno code on error. 1230 */ 1231 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1232 struct flowi6 *fl6) 1233 { 1234 *dst = NULL; 1235 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1236 } 1237 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1238 1239 /** 1240 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1241 * @net: Network namespace to perform lookup in 1242 * @sk: socket which provides route info 1243 * @fl6: flow to lookup 1244 * @final_dst: final destination address for ipsec lookup 1245 * 1246 * This function performs a route lookup on the given flow. 1247 * 1248 * It returns a valid dst pointer on success, or a pointer encoded 1249 * error code. 1250 */ 1251 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1252 const struct in6_addr *final_dst) 1253 { 1254 struct dst_entry *dst = NULL; 1255 int err; 1256 1257 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1258 if (err) 1259 return ERR_PTR(err); 1260 if (final_dst) 1261 fl6->daddr = *final_dst; 1262 1263 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1264 } 1265 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1266 1267 /** 1268 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1269 * @sk: socket which provides the dst cache and route info 1270 * @fl6: flow to lookup 1271 * @final_dst: final destination address for ipsec lookup 1272 * @connected: whether @sk is connected or not 1273 * 1274 * This function performs a route lookup on the given flow with the 1275 * possibility of using the cached route in the socket if it is valid. 1276 * It will take the socket dst lock when operating on the dst cache. 1277 * As a result, this function can only be used in process context. 1278 * 1279 * In addition, for a connected socket, cache the dst in the socket 1280 * if the current cache is not valid. 1281 * 1282 * It returns a valid dst pointer on success, or a pointer encoded 1283 * error code. 
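 * The caller owns a reference on the returned dst and must either attach
 * it to an skb or drop it with dst_release().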
1284 */ 1285 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1286 const struct in6_addr *final_dst, 1287 bool connected) 1288 { 1289 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1290 1291 dst = ip6_sk_dst_check(sk, dst, fl6); 1292 if (dst) 1293 return dst; 1294 1295 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1296 if (connected && !IS_ERR(dst)) 1297 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1298 1299 return dst; 1300 } 1301 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1302 1303 /** 1304 * ip6_dst_lookup_tunnel - perform route lookup on tunnel 1305 * @skb: Packet for which lookup is done 1306 * @dev: Tunnel device 1307 * @net: Network namespace of tunnel device 1308 * @sock: Socket which provides route info 1309 * @saddr: Memory to store the src ip address 1310 * @info: Tunnel information 1311 * @protocol: IP protocol 1312 * @use_cache: Flag to enable cache usage 1313 * This function performs a route lookup on a tunnel 1314 * 1315 * It returns a valid dst pointer and stores src address to be used in 1316 * tunnel in param saddr on success, else a pointer encoded error code. 1317 */ 1318 1319 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, 1320 struct net_device *dev, 1321 struct net *net, 1322 struct socket *sock, 1323 struct in6_addr *saddr, 1324 const struct ip_tunnel_info *info, 1325 u8 protocol, 1326 bool use_cache) 1327 { 1328 struct dst_entry *dst = NULL; 1329 #ifdef CONFIG_DST_CACHE 1330 struct dst_cache *dst_cache; 1331 #endif 1332 struct flowi6 fl6; 1333 __u8 prio; 1334 1335 #ifdef CONFIG_DST_CACHE 1336 dst_cache = (struct dst_cache *)&info->dst_cache; 1337 if (use_cache) { 1338 dst = dst_cache_get_ip6(dst_cache, saddr); 1339 if (dst) 1340 return dst; 1341 } 1342 #endif 1343 memset(&fl6, 0, sizeof(fl6)); 1344 fl6.flowi6_mark = skb->mark; 1345 fl6.flowi6_proto = protocol; 1346 fl6.daddr = info->key.u.ipv6.dst; 1347 fl6.saddr = info->key.u.ipv6.src; 1348 prio = info->key.tos; 1349 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label); 1350 1351 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6, 1352 NULL); 1353 if (IS_ERR(dst)) { 1354 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); 1355 return ERR_PTR(-ENETUNREACH); 1356 } 1357 if (dst->dev == dev) { /* is this necessary? */ 1358 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); 1359 dst_release(dst); 1360 return ERR_PTR(-ELOOP); 1361 } 1362 #ifdef CONFIG_DST_CACHE 1363 if (use_cache) 1364 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr); 1365 #endif 1366 *saddr = fl6.saddr; 1367 return dst; 1368 } 1369 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel); 1370 1371 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1372 gfp_t gfp) 1373 { 1374 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1375 } 1376 1377 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1378 gfp_t gfp) 1379 { 1380 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1381 } 1382 1383 static void ip6_append_data_mtu(unsigned int *mtu, 1384 int *maxfraglen, 1385 unsigned int fragheaderlen, 1386 struct sk_buff *skb, 1387 struct rt6_info *rt, 1388 unsigned int orig_mtu) 1389 { 1390 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1391 if (!skb) { 1392 /* first fragment, reserve header_len */ 1393 *mtu = orig_mtu - rt->dst.header_len; 1394 1395 } else { 1396 /* 1397 * this fragment is not first, the headers 1398 * space is regarded as data space. 
1399 */ 1400 *mtu = orig_mtu; 1401 } 1402 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1403 + fragheaderlen - sizeof(struct frag_hdr); 1404 } 1405 } 1406 1407 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1408 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, 1409 struct rt6_info *rt) 1410 { 1411 struct ipv6_pinfo *np = inet6_sk(sk); 1412 unsigned int mtu; 1413 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1414 1415 /* callers pass dst together with a reference, set it first so 1416 * ip6_cork_release() can put it down even in case of an error. 1417 */ 1418 cork->base.dst = &rt->dst; 1419 1420 /* 1421 * setup for corking 1422 */ 1423 if (opt) { 1424 if (WARN_ON(v6_cork->opt)) 1425 return -EINVAL; 1426 1427 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); 1428 if (unlikely(!nopt)) 1429 return -ENOBUFS; 1430 1431 nopt->tot_len = sizeof(*opt); 1432 nopt->opt_flen = opt->opt_flen; 1433 nopt->opt_nflen = opt->opt_nflen; 1434 1435 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1436 if (opt->dst0opt && !nopt->dst0opt) 1437 return -ENOBUFS; 1438 1439 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1440 if (opt->dst1opt && !nopt->dst1opt) 1441 return -ENOBUFS; 1442 1443 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1444 if (opt->hopopt && !nopt->hopopt) 1445 return -ENOBUFS; 1446 1447 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1448 if (opt->srcrt && !nopt->srcrt) 1449 return -ENOBUFS; 1450 1451 /* need source address above miyazawa*/ 1452 } 1453 v6_cork->hop_limit = ipc6->hlimit; 1454 v6_cork->tclass = ipc6->tclass; 1455 if (rt->dst.flags & DST_XFRM_TUNNEL) 1456 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1457 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); 1458 else 1459 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 
1460 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); 1461 if (np->frag_size < mtu) { 1462 if (np->frag_size) 1463 mtu = np->frag_size; 1464 } 1465 cork->base.fragsize = mtu; 1466 cork->base.gso_size = ipc6->gso_size; 1467 cork->base.tx_flags = 0; 1468 cork->base.mark = ipc6->sockc.mark; 1469 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); 1470 1471 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1472 cork->base.flags |= IPCORK_ALLFRAG; 1473 cork->base.length = 0; 1474 1475 cork->base.transmit_time = ipc6->sockc.transmit_time; 1476 1477 return 0; 1478 } 1479 1480 static int __ip6_append_data(struct sock *sk, 1481 struct sk_buff_head *queue, 1482 struct inet_cork_full *cork_full, 1483 struct inet6_cork *v6_cork, 1484 struct page_frag *pfrag, 1485 int getfrag(void *from, char *to, int offset, 1486 int len, int odd, struct sk_buff *skb), 1487 void *from, size_t length, int transhdrlen, 1488 unsigned int flags, struct ipcm6_cookie *ipc6) 1489 { 1490 struct sk_buff *skb, *skb_prev = NULL; 1491 struct inet_cork *cork = &cork_full->base; 1492 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1493 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1494 struct ubuf_info *uarg = NULL; 1495 int exthdrlen = 0; 1496 int dst_exthdrlen = 0; 1497 int hh_len; 1498 int copy; 1499 int err; 1500 int offset = 0; 1501 bool zc = false; 1502 u32 tskey = 0; 1503 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1504 struct ipv6_txoptions *opt = v6_cork->opt; 1505 int csummode = CHECKSUM_NONE; 1506 unsigned int maxnonfragsize, headersize; 1507 unsigned int wmem_alloc_delta = 0; 1508 bool paged, extra_uref = false; 1509 1510 skb = skb_peek_tail(queue); 1511 if (!skb) { 1512 exthdrlen = opt ? opt->opt_flen : 0; 1513 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1514 } 1515 1516 paged = !!cork->gso_size; 1517 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1518 orig_mtu = mtu; 1519 1520 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1521 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) 1522 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1523 1524 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1525 1526 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1527 (opt ? opt->opt_nflen : 0); 1528 1529 headersize = sizeof(struct ipv6hdr) + 1530 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1531 (dst_allfrag(&rt->dst) ? 
1532 sizeof(struct frag_hdr) : 0) + 1533 rt->rt6i_nfheader_len; 1534 1535 if (mtu <= fragheaderlen || 1536 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1537 goto emsgsize; 1538 1539 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1540 sizeof(struct frag_hdr); 1541 1542 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1543 * the first fragment 1544 */ 1545 if (headersize + transhdrlen > mtu) 1546 goto emsgsize; 1547 1548 if (cork->length + length > mtu - headersize && ipc6->dontfrag && 1549 (sk->sk_protocol == IPPROTO_UDP || 1550 sk->sk_protocol == IPPROTO_ICMPV6 || 1551 sk->sk_protocol == IPPROTO_RAW)) { 1552 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1553 sizeof(struct ipv6hdr)); 1554 goto emsgsize; 1555 } 1556 1557 if (ip6_sk_ignore_df(sk)) 1558 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1559 else 1560 maxnonfragsize = mtu; 1561 1562 if (cork->length + length > maxnonfragsize - headersize) { 1563 emsgsize: 1564 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1565 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1566 return -EMSGSIZE; 1567 } 1568 1569 /* CHECKSUM_PARTIAL only with no extension headers and when 1570 * we are not going to fragment 1571 */ 1572 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1573 headersize == sizeof(struct ipv6hdr) && 1574 length <= mtu - headersize && 1575 (!(flags & MSG_MORE) || cork->gso_size) && 1576 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1577 csummode = CHECKSUM_PARTIAL; 1578 1579 if ((flags & MSG_ZEROCOPY) && length) { 1580 struct msghdr *msg = from; 1581 1582 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1583 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1584 return -EINVAL; 1585 1586 /* Leave uarg NULL if can't zerocopy, callers should 1587 * be able to handle it. 1588 */ 1589 if ((rt->dst.dev->features & NETIF_F_SG) && 1590 csummode == CHECKSUM_PARTIAL) { 1591 paged = true; 1592 zc = true; 1593 uarg = msg->msg_ubuf; 1594 } 1595 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1596 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1597 if (!uarg) 1598 return -ENOBUFS; 1599 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1600 if (rt->dst.dev->features & NETIF_F_SG && 1601 csummode == CHECKSUM_PARTIAL) { 1602 paged = true; 1603 zc = true; 1604 } else { 1605 uarg_to_msgzc(uarg)->zerocopy = 0; 1606 skb_zcopy_set(skb, uarg, &extra_uref); 1607 } 1608 } 1609 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1610 if (inet_test_bit(HDRINCL, sk)) 1611 return -EPERM; 1612 if (rt->dst.dev->features & NETIF_F_SG && 1613 getfrag == ip_generic_getfrag) 1614 /* We need an empty buffer to attach stuff to */ 1615 paged = true; 1616 else 1617 flags &= ~MSG_SPLICE_PAGES; 1618 } 1619 1620 /* 1621 * Let's try using as much space as possible. 1622 * Use MTU if total length of the message fits into the MTU. 1623 * Otherwise, we need to reserve fragment header and 1624 * fragment alignment (= 8-15 octects, in total). 1625 * 1626 * Note that we may need to "move" the data from the tail 1627 * of the buffer to the new fragment when we split 1628 * the message. 1629 * 1630 * FIXME: It may be fragmented into multiple chunks 1631 * at once if non-fragmentable extension headers 1632 * are too large. 1633 * --yoshfuji 1634 */ 1635 1636 cork->length += length; 1637 if (!skb) 1638 goto alloc_new_skb; 1639 1640 while (length > 0) { 1641 /* Check if the remaining data fits into current packet. 
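		 * If 'copy' comes out non-positive the skb is full and a new
		 * one is allocated at alloc_new_skb below.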
		 */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
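			 * Only MSG_SPLICE_PAGES tolerates that; other callers
			 * bail out with -EINVAL just below.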
1714 */ 1715 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { 1716 err = -EINVAL; 1717 goto error; 1718 } 1719 if (transhdrlen) { 1720 skb = sock_alloc_send_skb(sk, alloclen, 1721 (flags & MSG_DONTWAIT), &err); 1722 } else { 1723 skb = NULL; 1724 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1725 2 * sk->sk_sndbuf) 1726 skb = alloc_skb(alloclen, 1727 sk->sk_allocation); 1728 if (unlikely(!skb)) 1729 err = -ENOBUFS; 1730 } 1731 if (!skb) 1732 goto error; 1733 /* 1734 * Fill in the control structures 1735 */ 1736 skb->protocol = htons(ETH_P_IPV6); 1737 skb->ip_summed = csummode; 1738 skb->csum = 0; 1739 /* reserve for fragmentation and ipsec header */ 1740 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1741 dst_exthdrlen); 1742 1743 /* 1744 * Find where to start putting bytes 1745 */ 1746 data = skb_put(skb, fraglen - pagedlen); 1747 skb_set_network_header(skb, exthdrlen); 1748 data += fragheaderlen; 1749 skb->transport_header = (skb->network_header + 1750 fragheaderlen); 1751 if (fraggap) { 1752 skb->csum = skb_copy_and_csum_bits( 1753 skb_prev, maxfraglen, 1754 data + transhdrlen, fraggap); 1755 skb_prev->csum = csum_sub(skb_prev->csum, 1756 skb->csum); 1757 data += fraggap; 1758 pskb_trim_unique(skb_prev, maxfraglen); 1759 } 1760 if (copy > 0 && 1761 getfrag(from, data + transhdrlen, offset, 1762 copy, fraggap, skb) < 0) { 1763 err = -EFAULT; 1764 kfree_skb(skb); 1765 goto error; 1766 } else if (flags & MSG_SPLICE_PAGES) { 1767 copy = 0; 1768 } 1769 1770 offset += copy; 1771 length -= copy + transhdrlen; 1772 transhdrlen = 0; 1773 exthdrlen = 0; 1774 dst_exthdrlen = 0; 1775 1776 /* Only the initial fragment is time stamped */ 1777 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1778 cork->tx_flags = 0; 1779 skb_shinfo(skb)->tskey = tskey; 1780 tskey = 0; 1781 skb_zcopy_set(skb, uarg, &extra_uref); 1782 1783 if ((flags & MSG_CONFIRM) && !skb_prev) 1784 skb_set_dst_pending_confirm(skb, 1); 1785 1786 /* 1787 * Put the packet on the pending queue 1788 */ 1789 if (!skb->destructor) { 1790 skb->destructor = sock_wfree; 1791 skb->sk = sk; 1792 wmem_alloc_delta += skb->truesize; 1793 } 1794 __skb_queue_tail(queue, skb); 1795 continue; 1796 } 1797 1798 if (copy > length) 1799 copy = length; 1800 1801 if (!(rt->dst.dev->features&NETIF_F_SG) && 1802 skb_tailroom(skb) >= copy) { 1803 unsigned int off; 1804 1805 off = skb->len; 1806 if (getfrag(from, skb_put(skb, copy), 1807 offset, copy, off, skb) < 0) { 1808 __skb_trim(skb, off); 1809 err = -EFAULT; 1810 goto error; 1811 } 1812 } else if (flags & MSG_SPLICE_PAGES) { 1813 struct msghdr *msg = from; 1814 1815 err = -EIO; 1816 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1817 goto error; 1818 1819 err = skb_splice_from_iter(skb, &msg->msg_iter, copy, 1820 sk->sk_allocation); 1821 if (err < 0) 1822 goto error; 1823 copy = err; 1824 wmem_alloc_delta += copy; 1825 } else if (!zc) { 1826 int i = skb_shinfo(skb)->nr_frags; 1827 1828 err = -ENOMEM; 1829 if (!sk_page_frag_refill(sk, pfrag)) 1830 goto error; 1831 1832 skb_zcopy_downgrade_managed(skb); 1833 if (!skb_can_coalesce(skb, i, pfrag->page, 1834 pfrag->offset)) { 1835 err = -EMSGSIZE; 1836 if (i == MAX_SKB_FRAGS) 1837 goto error; 1838 1839 __skb_fill_page_desc(skb, i, pfrag->page, 1840 pfrag->offset, 0); 1841 skb_shinfo(skb)->nr_frags = ++i; 1842 get_page(pfrag->page); 1843 } 1844 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1845 if (getfrag(from, 1846 page_address(pfrag->page) + pfrag->offset, 1847 offset, copy, skb->len, skb) < 0) 1848 goto error_efault; 1849 1850 pfrag->offset 
+= copy; 1851 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1852 skb->len += copy; 1853 skb->data_len += copy; 1854 skb->truesize += copy; 1855 wmem_alloc_delta += copy; 1856 } else { 1857 err = skb_zerocopy_iter_dgram(skb, from, copy); 1858 if (err < 0) 1859 goto error; 1860 } 1861 offset += copy; 1862 length -= copy; 1863 } 1864 1865 if (wmem_alloc_delta) 1866 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1867 return 0; 1868 1869 error_efault: 1870 err = -EFAULT; 1871 error: 1872 net_zcopy_put_abort(uarg, extra_uref); 1873 cork->length -= length; 1874 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1875 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1876 return err; 1877 } 1878 1879 int ip6_append_data(struct sock *sk, 1880 int getfrag(void *from, char *to, int offset, int len, 1881 int odd, struct sk_buff *skb), 1882 void *from, size_t length, int transhdrlen, 1883 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1884 struct rt6_info *rt, unsigned int flags) 1885 { 1886 struct inet_sock *inet = inet_sk(sk); 1887 struct ipv6_pinfo *np = inet6_sk(sk); 1888 int exthdrlen; 1889 int err; 1890 1891 if (flags&MSG_PROBE) 1892 return 0; 1893 if (skb_queue_empty(&sk->sk_write_queue)) { 1894 /* 1895 * setup for corking 1896 */ 1897 dst_hold(&rt->dst); 1898 err = ip6_setup_cork(sk, &inet->cork, &np->cork, 1899 ipc6, rt); 1900 if (err) 1901 return err; 1902 1903 inet->cork.fl.u.ip6 = *fl6; 1904 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1905 length += exthdrlen; 1906 transhdrlen += exthdrlen; 1907 } else { 1908 transhdrlen = 0; 1909 } 1910 1911 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1912 &np->cork, sk_page_frag(sk), getfrag, 1913 from, length, transhdrlen, flags, ipc6); 1914 } 1915 EXPORT_SYMBOL_GPL(ip6_append_data); 1916 1917 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1918 { 1919 struct dst_entry *dst = cork->base.dst; 1920 1921 cork->base.dst = NULL; 1922 cork->base.flags &= ~IPCORK_ALLFRAG; 1923 skb_dst_set(skb, dst); 1924 } 1925 1926 static void ip6_cork_release(struct inet_cork_full *cork, 1927 struct inet6_cork *v6_cork) 1928 { 1929 if (v6_cork->opt) { 1930 struct ipv6_txoptions *opt = v6_cork->opt; 1931 1932 kfree(opt->dst0opt); 1933 kfree(opt->dst1opt); 1934 kfree(opt->hopopt); 1935 kfree(opt->srcrt); 1936 kfree(opt); 1937 v6_cork->opt = NULL; 1938 } 1939 1940 if (cork->base.dst) { 1941 dst_release(cork->base.dst); 1942 cork->base.dst = NULL; 1943 cork->base.flags &= ~IPCORK_ALLFRAG; 1944 } 1945 } 1946 1947 struct sk_buff *__ip6_make_skb(struct sock *sk, 1948 struct sk_buff_head *queue, 1949 struct inet_cork_full *cork, 1950 struct inet6_cork *v6_cork) 1951 { 1952 struct sk_buff *skb, *tmp_skb; 1953 struct sk_buff **tail_skb; 1954 struct in6_addr *final_dst; 1955 struct ipv6_pinfo *np = inet6_sk(sk); 1956 struct net *net = sock_net(sk); 1957 struct ipv6hdr *hdr; 1958 struct ipv6_txoptions *opt = v6_cork->opt; 1959 struct rt6_info *rt = (struct rt6_info *)cork->base.dst; 1960 struct flowi6 *fl6 = &cork->fl.u.ip6; 1961 unsigned char proto = fl6->flowi6_proto; 1962 1963 skb = __skb_dequeue(queue); 1964 if (!skb) 1965 goto out; 1966 tail_skb = &(skb_shinfo(skb)->frag_list); 1967 1968 /* move skb->data to ip header from ext header */ 1969 if (skb->data < skb_network_header(skb)) 1970 __skb_pull(skb, skb_network_offset(skb)); 1971 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1972 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1973 *tail_skb = tmp_skb; 1974 tail_skb = 
&(tmp_skb->next); 1975 skb->len += tmp_skb->len; 1976 skb->data_len += tmp_skb->len; 1977 skb->truesize += tmp_skb->truesize; 1978 tmp_skb->destructor = NULL; 1979 tmp_skb->sk = NULL; 1980 } 1981 1982 /* Allow local fragmentation. */ 1983 skb->ignore_df = ip6_sk_ignore_df(sk); 1984 __skb_pull(skb, skb_network_header_len(skb)); 1985 1986 final_dst = &fl6->daddr; 1987 if (opt && opt->opt_flen) 1988 ipv6_push_frag_opts(skb, opt, &proto); 1989 if (opt && opt->opt_nflen) 1990 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); 1991 1992 skb_push(skb, sizeof(struct ipv6hdr)); 1993 skb_reset_network_header(skb); 1994 hdr = ipv6_hdr(skb); 1995 1996 ip6_flow_hdr(hdr, v6_cork->tclass, 1997 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1998 ip6_autoflowlabel(net, np), fl6)); 1999 hdr->hop_limit = v6_cork->hop_limit; 2000 hdr->nexthdr = proto; 2001 hdr->saddr = fl6->saddr; 2002 hdr->daddr = *final_dst; 2003 2004 skb->priority = sk->sk_priority; 2005 skb->mark = cork->base.mark; 2006 skb->tstamp = cork->base.transmit_time; 2007 2008 ip6_cork_steal_dst(skb, cork); 2009 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 2010 if (proto == IPPROTO_ICMPV6) { 2011 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 2012 u8 icmp6_type; 2013 2014 if (sk->sk_socket->type == SOCK_RAW && 2015 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) 2016 icmp6_type = fl6->fl6_icmp_type; 2017 else 2018 icmp6_type = icmp6_hdr(skb)->icmp6_type; 2019 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 2020 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 2021 } 2022 2023 ip6_cork_release(cork, v6_cork); 2024 out: 2025 return skb; 2026 } 2027 2028 int ip6_send_skb(struct sk_buff *skb) 2029 { 2030 struct net *net = sock_net(skb->sk); 2031 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 2032 int err; 2033 2034 rcu_read_lock(); 2035 err = ip6_local_out(net, skb->sk, skb); 2036 if (err) { 2037 if (err > 0) 2038 err = net_xmit_errno(err); 2039 if (err) 2040 IP6_INC_STATS(net, rt->rt6i_idev, 2041 IPSTATS_MIB_OUTDISCARDS); 2042 } 2043 2044 rcu_read_unlock(); 2045 return err; 2046 } 2047 2048 int ip6_push_pending_frames(struct sock *sk) 2049 { 2050 struct sk_buff *skb; 2051 2052 skb = ip6_finish_skb(sk); 2053 if (!skb) 2054 return 0; 2055 2056 return ip6_send_skb(skb); 2057 } 2058 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 2059 2060 static void __ip6_flush_pending_frames(struct sock *sk, 2061 struct sk_buff_head *queue, 2062 struct inet_cork_full *cork, 2063 struct inet6_cork *v6_cork) 2064 { 2065 struct sk_buff *skb; 2066 2067 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 2068 if (skb_dst(skb)) 2069 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 2070 IPSTATS_MIB_OUTDISCARDS); 2071 kfree_skb(skb); 2072 } 2073 2074 ip6_cork_release(cork, v6_cork); 2075 } 2076 2077 void ip6_flush_pending_frames(struct sock *sk) 2078 { 2079 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 2080 &inet_sk(sk)->cork, &inet6_sk(sk)->cork); 2081 } 2082 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 2083 2084 struct sk_buff *ip6_make_skb(struct sock *sk, 2085 int getfrag(void *from, char *to, int offset, 2086 int len, int odd, struct sk_buff *skb), 2087 void *from, size_t length, int transhdrlen, 2088 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2089 unsigned int flags, struct inet_cork_full *cork) 2090 { 2091 struct inet6_cork v6_cork; 2092 struct sk_buff_head queue; 2093 int exthdrlen = (ipc6->opt ? 
				       ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}