/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb, false);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

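/* The finish-output stage below runs after the NF_INET_POST_ROUTING hook.
 * ip6_finish_output() first gives the cgroup BPF egress program a chance
 * to accept or drop the packet; __ip6_finish_output() then decides whether
 * the packet can go straight to ip6_finish_output2() or must be fragmented
 * first (oversized and not GSO, an allfrag destination, or a conntrack
 * defragmented packet that must not exceed the original fragment size).
 */
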
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
		return __ip6_finish_output(net, sk, skb);
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb(skb);
		return ret;
	}
}

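/* ip6_output - dst_entry output handler for locally generated packets.
 * Drops the packet if IPv6 is administratively disabled on the egress
 * device, otherwise runs the NF_INET_POST_ROUTING hook (skipped for
 * packets already rerouted by the XFRM policy check above) before handing
 * the skb to ip6_finish_output().
 */
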
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

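/* A typical call site, as a sketch: connection-oriented callers such as
 * TCP's inet6_csk_xmit() fill in a flowi6 describing the connected flow
 * and then transmit with something like
 *
 *	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark,
 *		       rcu_dereference(np->opt), np->tclass);
 *
 * The skb is expected to already carry a valid dst; ip6_xmit() only
 * prepends the extension headers and the IPv6 header itself.
 */
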
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

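/* ip6_forward_proxy_check - decide what to do with a packet whose
 * destination is subject to proxy NDISC.  Returns 1 when the packet is a
 * unicast neighbour-discovery message that must be delivered locally,
 * 0 when it may be forwarded as usual, and -1 when it must be dropped
 * (link-local destinations cannot be proxied).
 */
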
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

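/* ip6_forward - forwarding path for packets not addressed to this host.
 * Validates that forwarding is enabled and the packet is forwardable,
 * handles the router-alert option, sends ICMPV6_TIME_EXCEED when the hop
 * limit is exhausted, honours proxy NDP, emits redirects when the packet
 * leaves through the interface it arrived on, and checks the path MTU
 * before decrementing hop_limit and passing the skb to the
 * NF_INET_FORWARD hook.
 */
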
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We DO NOT make any processing on
	 * RA packets, pushing them to user level AS IS
	 * without any WARRANTY that application will be able
	 * to interpret them. The reason is that we
	 * cannot make anything clever here.
	 *
	 * We are not end-node, so that if packet contains
	 * AH/ESP, we cannot make anything.
	 * Defragmentation also would be mistake, RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 * incoming and outgoing devices are the same
		 * send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

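/* ip6_fragment - split an oversized packet into fragments and feed each
 * one to @output.  Two strategies are used: a fast path that reuses an
 * existing frag_list (each list member becomes one fragment, so only
 * headers are rewritten), and a slow path that allocates fresh skbs and
 * copies the payload in MTU-sized, 8-byte-aligned blocks.
 */
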
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 * Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 * Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 * Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 * Charge the memory for the fragment to any owner
		 * it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 * Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 * Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 * Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 * Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

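/* ip6_rt_check - helper for ip6_sk_dst_check().  Returns nonzero when the
 * cached route does not match @fl_addr: it is neither a host route for
 * that address nor a network route already validated against the
 * socket's cached peer address (@addr_cache).
 */
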
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the unconnected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS, and
	 * MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

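/* ip6_dst_lookup_tail - common tail of the route lookup helpers below.
 * Resolves a source address when the flow still has the unspecified
 * address as saddr, performs the actual routing table lookup, and (with
 * optimistic DAD) falls back to the default router's dst when the
 * looked-up next hop is not yet known to be reachable.
 */
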
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to look the route up in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

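/* ip6_setup_cork - initialise the cork for a corked send.  Duplicates the
 * caller's extension headers (they may live on the caller's stack), takes
 * a reference on the route, and records hop limit, traffic class, MTU and
 * timestamping flags so that later ip6_append_data() calls and the final
 * __ip6_make_skb() all see consistent parameters.
 */
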
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

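/* __ip6_append_data - append data to the pending queue.  Payload is
 * pulled in through the @getfrag callback (for example
 * ip_generic_getfrag(), which copies from an iov and can fold in a
 * checksum).  Data is laid out so that each queued skb ends on a
 * fragment boundary (maxfraglen); if the total ends up larger than the
 * MTU, ip6_fragment() can later split the resulting frag_list along
 * those precomputed boundaries without copying payload.
 */
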
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref = false;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

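/* A sketch of the usual corked-send sequence from a datagram protocol
 * (udpv6_sendmsg() follows roughly this shape, via its own wrappers):
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, ulen,
 *			      sizeof(struct udphdr), &ipc6, &fl6,
 *			      (struct rt6_info *)dst, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * ip6_append_data() may be called repeatedly with MSG_MORE before the
 * final push; the cork set up on the first call keeps the parameters.
 */
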
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

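/* __ip6_make_skb - collapse the pending queue into a single packet.
 * Trailing skbs are chained onto the first skb's frag_list (matching the
 * geometry ip6_fragment() expects), the extension headers and IPv6 header
 * are prepended, and the cork is released.  The resulting skb is ready
 * for ip6_send_skb().
 */
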
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

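/* ip6_make_skb - single-shot variant of the append/push sequence above.
 * Uses a caller-provided cork and a private queue instead of the socket's
 * write queue, so it does not depend on the socket lock or corking state;
 * the whole datagram is appended and turned into one skb in a single
 * call.  Used by UDP when no corking semantics are needed.
 */
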
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}