/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
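/* Deliver one packet to the link layer: resolve (or create) the
 * neighbour entry for the route's nexthop and hand the skb to
 * neigh_output(). Multicast packets are additionally looped back to
 * local listeners and accounted here, and lightweight-tunnel transmit
 * redirection is honoured before neighbour resolution.
 */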
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}
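/* Output entry point installed in dst->output for locally generated
 * unicast packets: set the outgoing device and protocol, drop
 * everything if IPv6 is administratively disabled on the device, then
 * run the NF_INET_POST_ROUTING hook (skipped for rerouted packets)
 * with ip6_finish_output() as the continuation.
 */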
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);

		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
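/* Deliver a packet carrying a Router Alert option to every raw socket
 * that registered for this alert value (IPV6_ROUTER_ALERT). The last
 * matching socket consumes the original skb, earlier ones get clones.
 * Returns 1 if the packet was delivered to at least one socket.
 */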
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			struct ipv6_pinfo *np = inet6_sk(sk);

			if (np && np->rtalert_isolate &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
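/* Final step of the forwarding path, run after the NF_INET_FORWARD
 * hook: update the forwarding statistics, let packets that were
 * already forwarded in hardware (switchdev L3 offload) go, clear the
 * timestamp so it is not misread as a departure time, and hand the
 * packet to dst_output().
 */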
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}
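/* Forward one packet on behalf of another host: check that forwarding
 * is enabled and the packet is forwardable, deliver Router Alert
 * packets to interested raw sockets, enforce the hop limit (sending a
 * Time Exceeded error when it would drop to zero), honour NDP
 * proxying, emit redirects when the packet leaves through the
 * interface it arrived on, and generate Packet Too Big when the
 * packet does not fit the outgoing path MTU.
 */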
int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
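/* Split a packet into fragments no larger than the path MTU and send
 * each one through @output. Two strategies are used: a fast path that
 * converts an existing frag_list into fragments in place when its
 * geometry already fits, and a slow path that allocates a fresh skb
 * per fragment and copies the data. All fragments share the same
 * identification value and carry the unfragmentable header chain.
 */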
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len, nexthdr_offset;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}
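	/* Slow path: the original skb is left intact and a new skb is
	 * allocated for every fragment. Each fragment gets a copy of the
	 * unfragmentable header chain, a fragment header, and up to mtu
	 * bytes of payload; all but the last are trimmed to a multiple of
	 * eight bytes, as fragment offsets require.
	 */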
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
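/* Helpers for validating a socket's cached dst entry against a new
 * flow: ip6_rt_check() tells whether a cached route key no longer
 * matches the flow address, and ip6_sk_dst_check() releases the
 * cached dst when the route or the outgoing interface changed.
 */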
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
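/* Core of the route lookup: select a source address when the flow
 * does not specify one, perform the routing table lookup, and (with
 * CONFIG_IPV6_OPTIMISTIC_DAD) fall back to the default router's dst
 * when the chosen source address is still optimistic and the nexthop
 * neighbour is not yet valid. Rejects v4-mapped source addresses
 * paired with non-v4-mapped destinations.
 */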
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
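/* A minimal usage sketch (hypothetical caller, not code from this
 * file): fill in a flowi6 and let the helper resolve the route and,
 * if needed, the source address. On success *dst holds a reference,
 * which skb_dst_set() consumes:
 *
 *	struct flowi6 fl6 = {};
 *	struct dst_entry *dst;
 *
 *	fl6.daddr = some_destination;		// assumed to be set
 *	fl6.flowi6_oif = sk->sk_bound_dev_if;
 *	if (ip6_dst_lookup(net, sk, &dst, &fl6) == 0)
 *		skb_dst_set(skb, dst);
 *
 * Callers whose traffic may traverse IPsec policies should instead
 * use ip6_dst_lookup_flow() below, which wraps the result in
 * xfrm_lookup_route().
 */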
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
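/* Recompute the per-fragment MTU and maximum fragment length while
 * appending data: the first fragment (no tail skb yet) must also
 * reserve rt->dst.header_len, while later fragments may use the full
 * original MTU. Skipped for XFRM tunnel dsts, whose MTU already
 * accounts for the tunnel headers.
 */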
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
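/* Workhorse behind ip6_append_data() and ip6_make_skb(): append
 * @length bytes obtained through @getfrag to the queue of pending
 * skbs, growing the tail skb when possible and allocating new
 * fragment-sized skbs when not, while enforcing the path MTU,
 * charging socket write memory, and setting up checksum offload,
 * timestamping and zerocopy state along the way.
 */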
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;
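	/* Main copy loop: while data remains, either extend the current
	 * tail skb up to the fragment boundary, or fall through to
	 * alloc_new_skb to start a new fragment-sized skb, moving any
	 * bytes past the boundary (fraggap) from the previous skb into
	 * the new one.
	 */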
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}
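		/* There is room in the current tail skb: copy into its
		 * linear tailroom when the device cannot do scatter-gather,
		 * attach page fragments from @pfrag otherwise, or hand the
		 * bytes to the zerocopy path when one is active.
		 */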
		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
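/* Public corking interface for datagram sockets: the first call on an
 * empty write queue sets up the cork from @ipc6/@rt/@fl6, later calls
 * reuse the corked flow and options until the pending frames are
 * pushed with ip6_push_pending_frames() or discarded with
 * ip6_flush_pending_frames().
 */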
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
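/* Collapse the queue of pending skbs built by __ip6_append_data()
 * into a single skb with a frag_list, prepend the extension headers
 * and the IPv6 header from the corked state, and release the cork.
 * The resulting skb is ready for ip6_send_skb().
 */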
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
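/* Non-corked counterpart of ip6_append_data() plus
 * ip6_push_pending_frames(): build the whole datagram on a private
 * queue with caller-provided cork state and return the finished skb
 * (or an ERR_PTR), without touching sk->sk_write_queue. Used by UDP
 * when no corking is in effect.
 */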
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}