// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
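
/*
 * Roughly, the local transmit path implemented in this file is (sketch):
 *
 *	ip6_xmit() / ip6_send_skb()
 *	    -> dst_output()			(after NF_INET_LOCAL_OUT)
 *		-> ip6_output()			(NF_INET_POST_ROUTING hook)
 *		    -> ip6_finish_output()
 *			-> ip6_fragment()	if the packet exceeds the MTU
 *			-> ip6_finish_output2()	neighbour resolution + xmit
 *
 * Forwarded packets enter at ip6_forward() and rejoin the path at
 * dst_output() via ip6_forward_finish().
 */
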
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
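
/*
 * Example of the ip6_xmit() calling convention (sketch only; based on the
 * stream-protocol callers such as TCP, argument names are illustrative).
 * The caller must already have attached a route to the skb with
 * skb_dst_set()/skb_dst_set_noref():
 *
 *	err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 *
 * where "opt" is the socket's ipv6_txoptions and may be NULL.
 */
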
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}
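
/*
 * ip6_forward() below is the forwarding slow path: it performs the policy
 * and sanity checks, emits the required ICMPv6 errors (hop limit exceeded,
 * packet too big, address errors), decrements the hop limit and hands the
 * packet to the NF_INET_FORWARD hook with ip6_forward_finish() as the
 * continuation.
 */
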
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
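
/*
 * ip6_fragment() below has two strategies.  If the packet already carries a
 * suitably sized frag_list (e.g. one built by UDP fragmentation offload),
 * the "fast path" turns each list member into a fragment in place.
 * Otherwise the "slow path" allocates a fresh skb per fragment and copies
 * the data.  In both cases every fragment but the last carries a multiple
 * of 8 bytes of fragmentable data, as required by the fragment header
 * offset encoding.
 */
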
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
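
/*
 * Worked example for the arithmetic above (illustrative numbers): with an
 * outgoing MTU of 1500 and an unfragmentable part of hlen = 40 bytes (a
 * bare IPv6 header), the per-fragment budget becomes
 *	mtu = 1500 - 40 - 8 = 1452 bytes.
 * The slow path then rounds every non-final fragment down to a multiple of
 * 8, i.e. 1448 bytes of fragmentable data, for an on-wire fragment size of
 * 40 + 8 + 1448 = 1496 bytes.
 */
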
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: network namespace to perform the lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
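
/*
 * Typical datagram-send usage (sketch only; variable names illustrative):
 *
 *	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;
 *	}
 *
 * Errors are reported via ERR_PTR(), so callers must test the result with
 * IS_ERR() rather than comparing against NULL.
 */
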
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);
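
	/*
	 * Worked example (illustrative numbers): with mtu = 1500, no
	 * extension headers and no nexthop header overhead, fragheaderlen
	 * is 40 and
	 *	maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488,
	 * so every non-final fragment carries 1448 bytes of fragmentable
	 * data and the resulting packet is 1496 bytes on the wire.
	 */
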
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
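
/*
 * The corked-send API above is used roughly as follows (sketch; locking and
 * error handling abbreviated, mirroring the UDPv6/raw socket send paths):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * ip6_push_pending_frames() and ip6_flush_pending_frames() are defined
 * further below.
 */
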
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
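
/*
 * ip6_make_skb() below is the single-shot counterpart of the append/push
 * sequence above: it builds the complete datagram on a private queue,
 * without touching sk->sk_write_queue, and returns the skb (or an ERR_PTR)
 * for the caller to hand to ip6_send_skb().  The lockless UDPv6 fast path
 * uses it this way.
 */
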
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}