// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	rcu_read_lock();
	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock();
	return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb) &&
	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	if ((skb->len > mtu && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
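		/* Note (added commentary): at this point the packet either
		 * fits the path MTU, may legitimately be fragmented later
		 * (ignore_df), or will be segmented by GSO before hitting
		 * the wire, so it can be handed to the LOCAL_OUT netfilter
		 * hook and then to dst_output().
		 */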
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a reaction involving a unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We DO NOT make any processing on
	 * RA packets, pushing them to user level AS IS
	 * without any WARRANTY that application will be able
	 * to interpret them. The reason is that we
	 * cannot make anything clever here.
	 *
	 * We are not end-node, so that if packet contains
	 * AH/ESP, we cannot make anything.
	 * Defragmentation also would be a mistake, RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding. The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but others would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 * incoming and outgoing devices are the same
		 * send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;
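	/* Added commentary: detach the existing frag_list from @skb. Those
	 * skbs become the chain of ready-made fragments that
	 * ip6_fraglist_prepare()/ip6_fraglist_next() walk, while @skb itself
	 * is trimmed down below to serve as the first fragment.
	 */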
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	 * then align the next start on an eight byte boundary
	 */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 * Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 * Charge the memory for the fragment to any owner
	 * it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 * Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 * Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 * Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 * Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 * Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 * Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
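	/* Added commentary: a nonzero err here only means the nexthop
	 * neighbour exists but is not yet reachable; if the chosen source
	 * address turns out to be OPTIMISTIC below, the lookup is redone
	 * towards the default router instead.
	 */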
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/**
 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
 * @skb: Packet for which lookup is done
 * @dev: Tunnel device
 * @net: Network namespace of tunnel device
 * @sock: Socket which provides route info
 * @saddr: Memory to store the src ip address
 * @info: Tunnel information
 * @protocol: IP protocol
 * @use_cache: Flag to enable cache usage
 *
 * This function performs a route lookup on a tunnel.
 *
 * It returns a valid dst pointer and stores src address to be used in
 * tunnel in param saddr on success, else a pointer encoded error code.
 */
struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
					struct net_device *dev,
					struct net *net,
					struct socket *sock,
					struct in6_addr *saddr,
					const struct ip_tunnel_info *info,
					u8 protocol,
					bool use_cache)
{
	struct dst_entry *dst = NULL;
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct flowi6 fl6;
	__u8 prio;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, saddr);
		if (dst)
			return dst;
	}
#endif
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = protocol;
	fl6.daddr = info->key.u.ipv6.dst;
	fl6.saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);

	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
					      NULL);
	if (IS_ERR(dst)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
#endif
	*saddr = fl6.saddr;
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen > 0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	cork->base.flags &= ~IPCORK_ALLFRAG;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}