1 /* 2 * IPv6 output functions 3 * Linux INET6 implementation 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * Based on linux/net/ipv4/ip_output.c 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 * 15 * Changes: 16 * A.N.Kuznetsov : airthmetics in fragmentation. 17 * extension headers are implemented. 18 * route changes now work. 19 * ip6_forward does not confuse sniffers. 20 * etc. 21 * 22 * H. von Brand : Added missing #include <linux/string.h> 23 * Imran Patel : frag id should be in NBO 24 * Kazunori MIYAZAWA @USAGI 25 * : add ip6_append_data and related functions 26 * for datagram xmit 27 */ 28 29 #include <linux/errno.h> 30 #include <linux/kernel.h> 31 #include <linux/string.h> 32 #include <linux/socket.h> 33 #include <linux/net.h> 34 #include <linux/netdevice.h> 35 #include <linux/if_arp.h> 36 #include <linux/in6.h> 37 #include <linux/tcp.h> 38 #include <linux/route.h> 39 #include <linux/module.h> 40 #include <linux/slab.h> 41 42 #include <linux/bpf-cgroup.h> 43 #include <linux/netfilter.h> 44 #include <linux/netfilter_ipv6.h> 45 46 #include <net/sock.h> 47 #include <net/snmp.h> 48 49 #include <net/ipv6.h> 50 #include <net/ndisc.h> 51 #include <net/protocol.h> 52 #include <net/ip6_route.h> 53 #include <net/addrconf.h> 54 #include <net/rawv6.h> 55 #include <net/icmp.h> 56 #include <net/xfrm.h> 57 #include <net/checksum.h> 58 #include <linux/mroute6.h> 59 #include <net/l3mdev.h> 60 #include <net/lwtunnel.h> 61 62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 63 { 64 struct dst_entry *dst = skb_dst(skb); 65 struct net_device *dev = dst->dev; 66 struct neighbour *neigh; 67 struct in6_addr *nexthop; 68 int ret; 69 70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { 71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 72 73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && 74 ((mroute6_is_socket(net, skb) && 75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, 77 &ipv6_hdr(skb)->saddr))) { 78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 79 80 /* Do not check for IFF_ALLMULTI; multicast routing 81 is not supported in any case. 82 */ 83 if (newskb) 84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 85 net, sk, newskb, NULL, newskb->dev, 86 dev_loopback_xmit); 87 88 if (ipv6_hdr(skb)->hop_limit == 0) { 89 IP6_INC_STATS(net, idev, 90 IPSTATS_MIB_OUTDISCARDS); 91 kfree_skb(skb); 92 return 0; 93 } 94 } 95 96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 97 98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= 99 IPV6_ADDR_SCOPE_NODELOCAL && 100 !(dev->flags & IFF_LOOPBACK)) { 101 kfree_skb(skb); 102 return 0; 103 } 104 } 105 106 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 107 int res = lwtunnel_xmit(skb); 108 109 if (res < 0 || res == LWTUNNEL_XMIT_DONE) 110 return res; 111 } 112 113 rcu_read_lock_bh(); 114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 116 if (unlikely(!neigh)) 117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 118 if (!IS_ERR(neigh)) { 119 sock_confirm_neigh(skb, neigh); 120 ret = neigh_output(neigh, skb); 121 rcu_read_unlock_bh(); 122 return ret; 123 } 124 rcu_read_unlock_bh(); 125 126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 127 kfree_skb(skb); 128 return -EINVAL; 129 } 130 131 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 132 { 133 int ret; 134 135 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 136 if (ret) { 137 kfree_skb(skb); 138 return ret; 139 } 140 141 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 142 /* Policy lookup after SNAT yielded a new policy */ 143 if (skb_dst(skb)->xfrm) { 144 IPCB(skb)->flags |= IPSKB_REROUTED; 145 return dst_output(net, sk, skb); 146 } 147 #endif 148 149 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || 150 dst_allfrag(skb_dst(skb)) || 151 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 152 return ip6_fragment(net, sk, skb, ip6_finish_output2); 153 else 154 return ip6_finish_output2(net, sk, skb); 155 } 156 157 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 158 { 159 struct net_device *dev = skb_dst(skb)->dev; 160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 161 162 skb->protocol = htons(ETH_P_IPV6); 163 skb->dev = dev; 164 165 if (unlikely(idev->cnf.disable_ipv6)) { 166 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 167 kfree_skb(skb); 168 return 0; 169 } 170 171 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 172 net, sk, skb, NULL, dev, 173 ip6_finish_output, 174 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 175 } 176 177 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) 178 { 179 if (!np->autoflowlabel_set) 180 return ip6_default_np_autolabel(net); 181 else 182 return np->autoflowlabel; 183 } 184 185 /* 186 * xmit an sk_buff (used by TCP, SCTP and DCCP) 187 * Note : socket lock is not held for SYNACK packets, but might be modified 188 * by calls to skb_set_owner_w() and ipv6_local_error(), 189 * which are using proper atomic operations or spinlocks. 190 */ 191 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 192 __u32 mark, struct ipv6_txoptions *opt, int tclass) 193 { 194 struct net *net = sock_net(sk); 195 const struct ipv6_pinfo *np = inet6_sk(sk); 196 struct in6_addr *first_hop = &fl6->daddr; 197 struct dst_entry *dst = skb_dst(skb); 198 unsigned int head_room; 199 struct ipv6hdr *hdr; 200 u8 proto = fl6->flowi6_proto; 201 int seg_len = skb->len; 202 int hlimit = -1; 203 u32 mtu; 204 205 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); 206 if (opt) 207 head_room += opt->opt_nflen + opt->opt_flen; 208 209 if (unlikely(skb_headroom(skb) < head_room)) { 210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); 211 if (!skb2) { 212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 213 IPSTATS_MIB_OUTDISCARDS); 214 kfree_skb(skb); 215 return -ENOBUFS; 216 } 217 if (skb->sk) 218 skb_set_owner_w(skb2, skb->sk); 219 consume_skb(skb); 220 skb = skb2; 221 } 222 223 if (opt) { 224 seg_len += opt->opt_nflen + opt->opt_flen; 225 226 if (opt->opt_flen) 227 ipv6_push_frag_opts(skb, opt, &proto); 228 229 if (opt->opt_nflen) 230 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, 231 &fl6->saddr); 232 } 233 234 skb_push(skb, sizeof(struct ipv6hdr)); 235 skb_reset_network_header(skb); 236 hdr = ipv6_hdr(skb); 237 238 /* 239 * Fill in the IPv6 header 240 */ 241 if (np) 242 hlimit = np->hop_limit; 243 if (hlimit < 0) 244 hlimit = ip6_dst_hoplimit(dst); 245 246 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 247 ip6_autoflowlabel(net, np), fl6)); 248 249 hdr->payload_len = htons(seg_len); 250 hdr->nexthdr = proto; 251 hdr->hop_limit = hlimit; 252 253 hdr->saddr = fl6->saddr; 254 hdr->daddr = *first_hop; 255 256 skb->protocol = htons(ETH_P_IPV6); 257 skb->priority = sk->sk_priority; 258 skb->mark = mark; 259 260 mtu = dst_mtu(dst); 261 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 262 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), 263 IPSTATS_MIB_OUT, skb->len); 264 265 /* if egress device is enslaved to an L3 master device pass the 266 * skb to its handler for processing 267 */ 268 skb = l3mdev_ip6_out((struct sock *)sk, skb); 269 if (unlikely(!skb)) 270 return 0; 271 272 /* hooks should never assume socket lock is held. 273 * we promote our socket to non const 274 */ 275 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 276 net, (struct sock *)sk, skb, NULL, dst->dev, 277 dst_output); 278 } 279 280 skb->dev = dst->dev; 281 /* ipv6_local_error() does not require socket lock, 282 * we promote our socket to non const 283 */ 284 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 285 286 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); 287 kfree_skb(skb); 288 return -EMSGSIZE; 289 } 290 EXPORT_SYMBOL(ip6_xmit); 291 292 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 293 { 294 struct ip6_ra_chain *ra; 295 struct sock *last = NULL; 296 297 read_lock(&ip6_ra_lock); 298 for (ra = ip6_ra_chain; ra; ra = ra->next) { 299 struct sock *sk = ra->sk; 300 if (sk && ra->sel == sel && 301 (!sk->sk_bound_dev_if || 302 sk->sk_bound_dev_if == skb->dev->ifindex)) { 303 if (last) { 304 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 305 if (skb2) 306 rawv6_rcv(last, skb2); 307 } 308 last = sk; 309 } 310 } 311 312 if (last) { 313 rawv6_rcv(last, skb); 314 read_unlock(&ip6_ra_lock); 315 return 1; 316 } 317 read_unlock(&ip6_ra_lock); 318 return 0; 319 } 320 321 static int ip6_forward_proxy_check(struct sk_buff *skb) 322 { 323 struct ipv6hdr *hdr = ipv6_hdr(skb); 324 u8 nexthdr = hdr->nexthdr; 325 __be16 frag_off; 326 int offset; 327 328 if (ipv6_ext_hdr(nexthdr)) { 329 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 330 if (offset < 0) 331 return 0; 332 } else 333 offset = sizeof(struct ipv6hdr); 334 335 if (nexthdr == IPPROTO_ICMPV6) { 336 struct icmp6hdr *icmp6; 337 338 if (!pskb_may_pull(skb, (skb_network_header(skb) + 339 offset + 1 - skb->data))) 340 return 0; 341 342 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 343 344 switch (icmp6->icmp6_type) { 345 case NDISC_ROUTER_SOLICITATION: 346 case NDISC_ROUTER_ADVERTISEMENT: 347 case NDISC_NEIGHBOUR_SOLICITATION: 348 case NDISC_NEIGHBOUR_ADVERTISEMENT: 349 case NDISC_REDIRECT: 350 /* For reaction involving unicast neighbor discovery 351 * message destined to the proxied address, pass it to 352 * input function. 353 */ 354 return 1; 355 default: 356 break; 357 } 358 } 359 360 /* 361 * The proxying router can't forward traffic sent to a link-local 362 * address, so signal the sender and discard the packet. This 363 * behavior is clarified by the MIPv6 specification. 364 */ 365 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 366 dst_link_failure(skb); 367 return -1; 368 } 369 370 return 0; 371 } 372 373 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 374 struct sk_buff *skb) 375 { 376 struct dst_entry *dst = skb_dst(skb); 377 378 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 379 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); 380 381 #ifdef CONFIG_NET_SWITCHDEV 382 if (skb->offload_l3_fwd_mark) { 383 consume_skb(skb); 384 return 0; 385 } 386 #endif 387 388 return dst_output(net, sk, skb); 389 } 390 391 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 392 { 393 if (skb->len <= mtu) 394 return false; 395 396 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 397 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 398 return true; 399 400 if (skb->ignore_df) 401 return false; 402 403 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) 404 return false; 405 406 return true; 407 } 408 409 int ip6_forward(struct sk_buff *skb) 410 { 411 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); 412 struct dst_entry *dst = skb_dst(skb); 413 struct ipv6hdr *hdr = ipv6_hdr(skb); 414 struct inet6_skb_parm *opt = IP6CB(skb); 415 struct net *net = dev_net(dst->dev); 416 u32 mtu; 417 418 if (net->ipv6.devconf_all->forwarding == 0) 419 goto error; 420 421 if (skb->pkt_type != PACKET_HOST) 422 goto drop; 423 424 if (unlikely(skb->sk)) 425 goto drop; 426 427 if (skb_warn_if_lro(skb)) 428 goto drop; 429 430 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 431 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 432 goto drop; 433 } 434 435 skb_forward_csum(skb); 436 437 /* 438 * We DO NOT make any processing on 439 * RA packets, pushing them to user level AS IS 440 * without ane WARRANTY that application will be able 441 * to interpret them. The reason is that we 442 * cannot make anything clever here. 443 * 444 * We are not end-node, so that if packet contains 445 * AH/ESP, we cannot make anything. 446 * Defragmentation also would be mistake, RA packets 447 * cannot be fragmented, because there is no warranty 448 * that different fragments will go along one path. --ANK 449 */ 450 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 451 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 452 return 0; 453 } 454 455 /* 456 * check and decrement ttl 457 */ 458 if (hdr->hop_limit <= 1) { 459 /* Force OUTPUT device used as source address */ 460 skb->dev = dst->dev; 461 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 462 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 463 464 kfree_skb(skb); 465 return -ETIMEDOUT; 466 } 467 468 /* XXX: idev->cnf.proxy_ndp? */ 469 if (net->ipv6.devconf_all->proxy_ndp && 470 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { 471 int proxied = ip6_forward_proxy_check(skb); 472 if (proxied > 0) 473 return ip6_input(skb); 474 else if (proxied < 0) { 475 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 476 goto drop; 477 } 478 } 479 480 if (!xfrm6_route_forward(skb)) { 481 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 482 goto drop; 483 } 484 dst = skb_dst(skb); 485 486 /* IPv6 specs say nothing about it, but it is clear that we cannot 487 send redirects to source routed frames. 488 We don't send redirects to frames decapsulated from IPsec. 489 */ 490 if (IP6CB(skb)->iif == dst->dev->ifindex && 491 opt->srcrt == 0 && !skb_sec_path(skb)) { 492 struct in6_addr *target = NULL; 493 struct inet_peer *peer; 494 struct rt6_info *rt; 495 496 /* 497 * incoming and outgoing devices are the same 498 * send a redirect. 499 */ 500 501 rt = (struct rt6_info *) dst; 502 if (rt->rt6i_flags & RTF_GATEWAY) 503 target = &rt->rt6i_gateway; 504 else 505 target = &hdr->daddr; 506 507 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 508 509 /* Limit redirects both by destination (here) 510 and by source (inside ndisc_send_redirect) 511 */ 512 if (inet_peer_xrlim_allow(peer, 1*HZ)) 513 ndisc_send_redirect(skb, target); 514 if (peer) 515 inet_putpeer(peer); 516 } else { 517 int addrtype = ipv6_addr_type(&hdr->saddr); 518 519 /* This check is security critical. */ 520 if (addrtype == IPV6_ADDR_ANY || 521 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 522 goto error; 523 if (addrtype & IPV6_ADDR_LINKLOCAL) { 524 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 525 ICMPV6_NOT_NEIGHBOUR, 0); 526 goto error; 527 } 528 } 529 530 mtu = ip6_dst_mtu_forward(dst); 531 if (mtu < IPV6_MIN_MTU) 532 mtu = IPV6_MIN_MTU; 533 534 if (ip6_pkt_too_big(skb, mtu)) { 535 /* Again, force OUTPUT device used as source address */ 536 skb->dev = dst->dev; 537 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 538 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 539 __IP6_INC_STATS(net, ip6_dst_idev(dst), 540 IPSTATS_MIB_FRAGFAILS); 541 kfree_skb(skb); 542 return -EMSGSIZE; 543 } 544 545 if (skb_cow(skb, dst->dev->hard_header_len)) { 546 __IP6_INC_STATS(net, ip6_dst_idev(dst), 547 IPSTATS_MIB_OUTDISCARDS); 548 goto drop; 549 } 550 551 hdr = ipv6_hdr(skb); 552 553 /* Mangling hops number delayed to point after skb COW */ 554 555 hdr->hop_limit--; 556 557 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 558 net, NULL, skb, skb->dev, dst->dev, 559 ip6_forward_finish); 560 561 error: 562 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 563 drop: 564 kfree_skb(skb); 565 return -EINVAL; 566 } 567 568 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 569 { 570 to->pkt_type = from->pkt_type; 571 to->priority = from->priority; 572 to->protocol = from->protocol; 573 skb_dst_drop(to); 574 skb_dst_set(to, dst_clone(skb_dst(from))); 575 to->dev = from->dev; 576 to->mark = from->mark; 577 578 skb_copy_hash(to, from); 579 580 #ifdef CONFIG_NET_SCHED 581 to->tc_index = from->tc_index; 582 #endif 583 nf_copy(to, from); 584 skb_copy_secmark(to, from); 585 } 586 587 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 588 int (*output)(struct net *, struct sock *, struct sk_buff *)) 589 { 590 struct sk_buff *frag; 591 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 592 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 593 inet6_sk(skb->sk) : NULL; 594 struct ipv6hdr *tmp_hdr; 595 struct frag_hdr *fh; 596 unsigned int mtu, hlen, left, len; 597 int hroom, troom; 598 __be32 frag_id; 599 int ptr, offset = 0, err = 0; 600 u8 *prevhdr, nexthdr = 0; 601 602 err = ip6_find_1stfragopt(skb, &prevhdr); 603 if (err < 0) 604 goto fail; 605 hlen = err; 606 nexthdr = *prevhdr; 607 608 mtu = ip6_skb_dst_mtu(skb); 609 610 /* We must not fragment if the socket is set to force MTU discovery 611 * or if the skb it not generated by a local socket. 612 */ 613 if (unlikely(!skb->ignore_df && skb->len > mtu)) 614 goto fail_toobig; 615 616 if (IP6CB(skb)->frag_max_size) { 617 if (IP6CB(skb)->frag_max_size > mtu) 618 goto fail_toobig; 619 620 /* don't send fragments larger than what we received */ 621 mtu = IP6CB(skb)->frag_max_size; 622 if (mtu < IPV6_MIN_MTU) 623 mtu = IPV6_MIN_MTU; 624 } 625 626 if (np && np->frag_size < mtu) { 627 if (np->frag_size) 628 mtu = np->frag_size; 629 } 630 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 631 goto fail_toobig; 632 mtu -= hlen + sizeof(struct frag_hdr); 633 634 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 635 &ipv6_hdr(skb)->saddr); 636 637 if (skb->ip_summed == CHECKSUM_PARTIAL && 638 (err = skb_checksum_help(skb))) 639 goto fail; 640 641 hroom = LL_RESERVED_SPACE(rt->dst.dev); 642 if (skb_has_frag_list(skb)) { 643 unsigned int first_len = skb_pagelen(skb); 644 struct sk_buff *frag2; 645 646 if (first_len - hlen > mtu || 647 ((first_len - hlen) & 7) || 648 skb_cloned(skb) || 649 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 650 goto slow_path; 651 652 skb_walk_frags(skb, frag) { 653 /* Correct geometry. */ 654 if (frag->len > mtu || 655 ((frag->len & 7) && frag->next) || 656 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 657 goto slow_path_clean; 658 659 /* Partially cloned skb? */ 660 if (skb_shared(frag)) 661 goto slow_path_clean; 662 663 BUG_ON(frag->sk); 664 if (skb->sk) { 665 frag->sk = skb->sk; 666 frag->destructor = sock_wfree; 667 } 668 skb->truesize -= frag->truesize; 669 } 670 671 err = 0; 672 offset = 0; 673 /* BUILD HEADER */ 674 675 *prevhdr = NEXTHDR_FRAGMENT; 676 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 677 if (!tmp_hdr) { 678 err = -ENOMEM; 679 goto fail; 680 } 681 frag = skb_shinfo(skb)->frag_list; 682 skb_frag_list_init(skb); 683 684 __skb_pull(skb, hlen); 685 fh = __skb_push(skb, sizeof(struct frag_hdr)); 686 __skb_push(skb, hlen); 687 skb_reset_network_header(skb); 688 memcpy(skb_network_header(skb), tmp_hdr, hlen); 689 690 fh->nexthdr = nexthdr; 691 fh->reserved = 0; 692 fh->frag_off = htons(IP6_MF); 693 fh->identification = frag_id; 694 695 first_len = skb_pagelen(skb); 696 skb->data_len = first_len - skb_headlen(skb); 697 skb->len = first_len; 698 ipv6_hdr(skb)->payload_len = htons(first_len - 699 sizeof(struct ipv6hdr)); 700 701 for (;;) { 702 /* Prepare header of the next frame, 703 * before previous one went down. */ 704 if (frag) { 705 frag->ip_summed = CHECKSUM_NONE; 706 skb_reset_transport_header(frag); 707 fh = __skb_push(frag, sizeof(struct frag_hdr)); 708 __skb_push(frag, hlen); 709 skb_reset_network_header(frag); 710 memcpy(skb_network_header(frag), tmp_hdr, 711 hlen); 712 offset += skb->len - hlen - sizeof(struct frag_hdr); 713 fh->nexthdr = nexthdr; 714 fh->reserved = 0; 715 fh->frag_off = htons(offset); 716 if (frag->next) 717 fh->frag_off |= htons(IP6_MF); 718 fh->identification = frag_id; 719 ipv6_hdr(frag)->payload_len = 720 htons(frag->len - 721 sizeof(struct ipv6hdr)); 722 ip6_copy_metadata(frag, skb); 723 } 724 725 err = output(net, sk, skb); 726 if (!err) 727 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 728 IPSTATS_MIB_FRAGCREATES); 729 730 if (err || !frag) 731 break; 732 733 skb = frag; 734 frag = skb->next; 735 skb_mark_not_on_list(skb); 736 } 737 738 kfree(tmp_hdr); 739 740 if (err == 0) { 741 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 742 IPSTATS_MIB_FRAGOKS); 743 return 0; 744 } 745 746 kfree_skb_list(frag); 747 748 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 749 IPSTATS_MIB_FRAGFAILS); 750 return err; 751 752 slow_path_clean: 753 skb_walk_frags(skb, frag2) { 754 if (frag2 == frag) 755 break; 756 frag2->sk = NULL; 757 frag2->destructor = NULL; 758 skb->truesize += frag2->truesize; 759 } 760 } 761 762 slow_path: 763 left = skb->len - hlen; /* Space per frame */ 764 ptr = hlen; /* Where to start from */ 765 766 /* 767 * Fragment the datagram. 768 */ 769 770 troom = rt->dst.dev->needed_tailroom; 771 772 /* 773 * Keep copying data until we run out. 774 */ 775 while (left > 0) { 776 u8 *fragnexthdr_offset; 777 778 len = left; 779 /* IF: it doesn't fit, use 'mtu' - the data space left */ 780 if (len > mtu) 781 len = mtu; 782 /* IF: we are not sending up to and including the packet end 783 then align the next start on an eight byte boundary */ 784 if (len < left) { 785 len &= ~7; 786 } 787 788 /* Allocate buffer */ 789 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + 790 hroom + troom, GFP_ATOMIC); 791 if (!frag) { 792 err = -ENOMEM; 793 goto fail; 794 } 795 796 /* 797 * Set up data on packet 798 */ 799 800 ip6_copy_metadata(frag, skb); 801 skb_reserve(frag, hroom); 802 skb_put(frag, len + hlen + sizeof(struct frag_hdr)); 803 skb_reset_network_header(frag); 804 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); 805 frag->transport_header = (frag->network_header + hlen + 806 sizeof(struct frag_hdr)); 807 808 /* 809 * Charge the memory for the fragment to any owner 810 * it might possess 811 */ 812 if (skb->sk) 813 skb_set_owner_w(frag, skb->sk); 814 815 /* 816 * Copy the packet header into the new buffer. 817 */ 818 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); 819 820 fragnexthdr_offset = skb_network_header(frag); 821 fragnexthdr_offset += prevhdr - skb_network_header(skb); 822 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 823 824 /* 825 * Build fragment header. 826 */ 827 fh->nexthdr = nexthdr; 828 fh->reserved = 0; 829 fh->identification = frag_id; 830 831 /* 832 * Copy a block of the IP datagram. 833 */ 834 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), 835 len)); 836 left -= len; 837 838 fh->frag_off = htons(offset); 839 if (left > 0) 840 fh->frag_off |= htons(IP6_MF); 841 ipv6_hdr(frag)->payload_len = htons(frag->len - 842 sizeof(struct ipv6hdr)); 843 844 ptr += len; 845 offset += len; 846 847 /* 848 * Put this fragment into the sending queue. 849 */ 850 err = output(net, sk, frag); 851 if (err) 852 goto fail; 853 854 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 855 IPSTATS_MIB_FRAGCREATES); 856 } 857 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 858 IPSTATS_MIB_FRAGOKS); 859 consume_skb(skb); 860 return err; 861 862 fail_toobig: 863 if (skb->sk && dst_allfrag(skb_dst(skb))) 864 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); 865 866 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 867 err = -EMSGSIZE; 868 869 fail: 870 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 871 IPSTATS_MIB_FRAGFAILS); 872 kfree_skb(skb); 873 return err; 874 } 875 876 static inline int ip6_rt_check(const struct rt6key *rt_key, 877 const struct in6_addr *fl_addr, 878 const struct in6_addr *addr_cache) 879 { 880 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 881 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 882 } 883 884 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 885 struct dst_entry *dst, 886 const struct flowi6 *fl6) 887 { 888 struct ipv6_pinfo *np = inet6_sk(sk); 889 struct rt6_info *rt; 890 891 if (!dst) 892 goto out; 893 894 if (dst->ops->family != AF_INET6) { 895 dst_release(dst); 896 return NULL; 897 } 898 899 rt = (struct rt6_info *)dst; 900 /* Yes, checking route validity in not connected 901 * case is not very simple. Take into account, 902 * that we do not support routing by source, TOS, 903 * and MSG_DONTROUTE --ANK (980726) 904 * 905 * 1. ip6_rt_check(): If route was host route, 906 * check that cached destination is current. 907 * If it is network route, we still may 908 * check its validity using saved pointer 909 * to the last used address: daddr_cache. 910 * We do not want to save whole address now, 911 * (because main consumer of this service 912 * is tcp, which has not this problem), 913 * so that the last trick works only on connected 914 * sockets. 915 * 2. oif also should be the same. 916 */ 917 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 918 #ifdef CONFIG_IPV6_SUBTREES 919 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 920 #endif 921 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && 922 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { 923 dst_release(dst); 924 dst = NULL; 925 } 926 927 out: 928 return dst; 929 } 930 931 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 932 struct dst_entry **dst, struct flowi6 *fl6) 933 { 934 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 935 struct neighbour *n; 936 struct rt6_info *rt; 937 #endif 938 int err; 939 int flags = 0; 940 941 /* The correct way to handle this would be to do 942 * ip6_route_get_saddr, and then ip6_route_output; however, 943 * the route-specific preferred source forces the 944 * ip6_route_output call _before_ ip6_route_get_saddr. 945 * 946 * In source specific routing (no src=any default route), 947 * ip6_route_output will fail given src=any saddr, though, so 948 * that's why we try it again later. 949 */ 950 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { 951 struct fib6_info *from; 952 struct rt6_info *rt; 953 bool had_dst = *dst != NULL; 954 955 if (!had_dst) 956 *dst = ip6_route_output(net, sk, fl6); 957 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 958 959 rcu_read_lock(); 960 from = rt ? rcu_dereference(rt->from) : NULL; 961 err = ip6_route_get_saddr(net, from, &fl6->daddr, 962 sk ? inet6_sk(sk)->srcprefs : 0, 963 &fl6->saddr); 964 rcu_read_unlock(); 965 966 if (err) 967 goto out_err_release; 968 969 /* If we had an erroneous initial result, pretend it 970 * never existed and let the SA-enabled version take 971 * over. 972 */ 973 if (!had_dst && (*dst)->error) { 974 dst_release(*dst); 975 *dst = NULL; 976 } 977 978 if (fl6->flowi6_oif) 979 flags |= RT6_LOOKUP_F_IFACE; 980 } 981 982 if (!*dst) 983 *dst = ip6_route_output_flags(net, sk, fl6, flags); 984 985 err = (*dst)->error; 986 if (err) 987 goto out_err_release; 988 989 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 990 /* 991 * Here if the dst entry we've looked up 992 * has a neighbour entry that is in the INCOMPLETE 993 * state and the src address from the flow is 994 * marked as OPTIMISTIC, we release the found 995 * dst entry and replace it instead with the 996 * dst entry of the nexthop router 997 */ 998 rt = (struct rt6_info *) *dst; 999 rcu_read_lock_bh(); 1000 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 1001 rt6_nexthop(rt, &fl6->daddr)); 1002 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; 1003 rcu_read_unlock_bh(); 1004 1005 if (err) { 1006 struct inet6_ifaddr *ifp; 1007 struct flowi6 fl_gw6; 1008 int redirect; 1009 1010 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 1011 (*dst)->dev, 1); 1012 1013 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 1014 if (ifp) 1015 in6_ifa_put(ifp); 1016 1017 if (redirect) { 1018 /* 1019 * We need to get the dst entry for the 1020 * default router instead 1021 */ 1022 dst_release(*dst); 1023 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1024 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1025 *dst = ip6_route_output(net, sk, &fl_gw6); 1026 err = (*dst)->error; 1027 if (err) 1028 goto out_err_release; 1029 } 1030 } 1031 #endif 1032 if (ipv6_addr_v4mapped(&fl6->saddr) && 1033 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1034 err = -EAFNOSUPPORT; 1035 goto out_err_release; 1036 } 1037 1038 return 0; 1039 1040 out_err_release: 1041 dst_release(*dst); 1042 *dst = NULL; 1043 1044 if (err == -ENETUNREACH) 1045 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1046 return err; 1047 } 1048 1049 /** 1050 * ip6_dst_lookup - perform route lookup on flow 1051 * @sk: socket which provides route info 1052 * @dst: pointer to dst_entry * for result 1053 * @fl6: flow to lookup 1054 * 1055 * This function performs a route lookup on the given flow. 1056 * 1057 * It returns zero on success, or a standard errno code on error. 1058 */ 1059 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1060 struct flowi6 *fl6) 1061 { 1062 *dst = NULL; 1063 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1064 } 1065 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1066 1067 /** 1068 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1069 * @sk: socket which provides route info 1070 * @fl6: flow to lookup 1071 * @final_dst: final destination address for ipsec lookup 1072 * 1073 * This function performs a route lookup on the given flow. 1074 * 1075 * It returns a valid dst pointer on success, or a pointer encoded 1076 * error code. 1077 */ 1078 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, 1079 const struct in6_addr *final_dst) 1080 { 1081 struct dst_entry *dst = NULL; 1082 int err; 1083 1084 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); 1085 if (err) 1086 return ERR_PTR(err); 1087 if (final_dst) 1088 fl6->daddr = *final_dst; 1089 1090 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); 1091 } 1092 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1093 1094 /** 1095 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1096 * @sk: socket which provides the dst cache and route info 1097 * @fl6: flow to lookup 1098 * @final_dst: final destination address for ipsec lookup 1099 * @connected: whether @sk is connected or not 1100 * 1101 * This function performs a route lookup on the given flow with the 1102 * possibility of using the cached route in the socket if it is valid. 1103 * It will take the socket dst lock when operating on the dst cache. 1104 * As a result, this function can only be used in process context. 1105 * 1106 * In addition, for a connected socket, cache the dst in the socket 1107 * if the current cache is not valid. 1108 * 1109 * It returns a valid dst pointer on success, or a pointer encoded 1110 * error code. 1111 */ 1112 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1113 const struct in6_addr *final_dst, 1114 bool connected) 1115 { 1116 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1117 1118 dst = ip6_sk_dst_check(sk, dst, fl6); 1119 if (dst) 1120 return dst; 1121 1122 dst = ip6_dst_lookup_flow(sk, fl6, final_dst); 1123 if (connected && !IS_ERR(dst)) 1124 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1125 1126 return dst; 1127 } 1128 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1129 1130 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1131 gfp_t gfp) 1132 { 1133 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1134 } 1135 1136 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1137 gfp_t gfp) 1138 { 1139 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1140 } 1141 1142 static void ip6_append_data_mtu(unsigned int *mtu, 1143 int *maxfraglen, 1144 unsigned int fragheaderlen, 1145 struct sk_buff *skb, 1146 struct rt6_info *rt, 1147 unsigned int orig_mtu) 1148 { 1149 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1150 if (!skb) { 1151 /* first fragment, reserve header_len */ 1152 *mtu = orig_mtu - rt->dst.header_len; 1153 1154 } else { 1155 /* 1156 * this fragment is not first, the headers 1157 * space is regarded as data space. 1158 */ 1159 *mtu = orig_mtu; 1160 } 1161 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1162 + fragheaderlen - sizeof(struct frag_hdr); 1163 } 1164 } 1165 1166 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1167 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, 1168 struct rt6_info *rt, struct flowi6 *fl6) 1169 { 1170 struct ipv6_pinfo *np = inet6_sk(sk); 1171 unsigned int mtu; 1172 struct ipv6_txoptions *opt = ipc6->opt; 1173 1174 /* 1175 * setup for corking 1176 */ 1177 if (opt) { 1178 if (WARN_ON(v6_cork->opt)) 1179 return -EINVAL; 1180 1181 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); 1182 if (unlikely(!v6_cork->opt)) 1183 return -ENOBUFS; 1184 1185 v6_cork->opt->tot_len = sizeof(*opt); 1186 v6_cork->opt->opt_flen = opt->opt_flen; 1187 v6_cork->opt->opt_nflen = opt->opt_nflen; 1188 1189 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt, 1190 sk->sk_allocation); 1191 if (opt->dst0opt && !v6_cork->opt->dst0opt) 1192 return -ENOBUFS; 1193 1194 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt, 1195 sk->sk_allocation); 1196 if (opt->dst1opt && !v6_cork->opt->dst1opt) 1197 return -ENOBUFS; 1198 1199 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt, 1200 sk->sk_allocation); 1201 if (opt->hopopt && !v6_cork->opt->hopopt) 1202 return -ENOBUFS; 1203 1204 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt, 1205 sk->sk_allocation); 1206 if (opt->srcrt && !v6_cork->opt->srcrt) 1207 return -ENOBUFS; 1208 1209 /* need source address above miyazawa*/ 1210 } 1211 dst_hold(&rt->dst); 1212 cork->base.dst = &rt->dst; 1213 cork->fl.u.ip6 = *fl6; 1214 v6_cork->hop_limit = ipc6->hlimit; 1215 v6_cork->tclass = ipc6->tclass; 1216 if (rt->dst.flags & DST_XFRM_TUNNEL) 1217 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1218 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); 1219 else 1220 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1221 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); 1222 if (np->frag_size < mtu) { 1223 if (np->frag_size) 1224 mtu = np->frag_size; 1225 } 1226 if (mtu < IPV6_MIN_MTU) 1227 return -EINVAL; 1228 cork->base.fragsize = mtu; 1229 cork->base.gso_size = ipc6->gso_size; 1230 cork->base.tx_flags = 0; 1231 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); 1232 1233 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1234 cork->base.flags |= IPCORK_ALLFRAG; 1235 cork->base.length = 0; 1236 1237 cork->base.transmit_time = ipc6->sockc.transmit_time; 1238 1239 return 0; 1240 } 1241 1242 static int __ip6_append_data(struct sock *sk, 1243 struct flowi6 *fl6, 1244 struct sk_buff_head *queue, 1245 struct inet_cork *cork, 1246 struct inet6_cork *v6_cork, 1247 struct page_frag *pfrag, 1248 int getfrag(void *from, char *to, int offset, 1249 int len, int odd, struct sk_buff *skb), 1250 void *from, int length, int transhdrlen, 1251 unsigned int flags, struct ipcm6_cookie *ipc6) 1252 { 1253 struct sk_buff *skb, *skb_prev = NULL; 1254 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1255 struct ubuf_info *uarg = NULL; 1256 int exthdrlen = 0; 1257 int dst_exthdrlen = 0; 1258 int hh_len; 1259 int copy; 1260 int err; 1261 int offset = 0; 1262 u32 tskey = 0; 1263 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1264 struct ipv6_txoptions *opt = v6_cork->opt; 1265 int csummode = CHECKSUM_NONE; 1266 unsigned int maxnonfragsize, headersize; 1267 unsigned int wmem_alloc_delta = 0; 1268 bool paged, extra_uref; 1269 1270 skb = skb_peek_tail(queue); 1271 if (!skb) { 1272 exthdrlen = opt ? opt->opt_flen : 0; 1273 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1274 } 1275 1276 paged = !!cork->gso_size; 1277 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1278 orig_mtu = mtu; 1279 1280 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && 1281 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) 1282 tskey = sk->sk_tskey++; 1283 1284 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1285 1286 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1287 (opt ? opt->opt_nflen : 0); 1288 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1289 sizeof(struct frag_hdr); 1290 1291 headersize = sizeof(struct ipv6hdr) + 1292 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1293 (dst_allfrag(&rt->dst) ? 1294 sizeof(struct frag_hdr) : 0) + 1295 rt->rt6i_nfheader_len; 1296 1297 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1298 * the first fragment 1299 */ 1300 if (headersize + transhdrlen > mtu) 1301 goto emsgsize; 1302 1303 if (cork->length + length > mtu - headersize && ipc6->dontfrag && 1304 (sk->sk_protocol == IPPROTO_UDP || 1305 sk->sk_protocol == IPPROTO_RAW)) { 1306 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1307 sizeof(struct ipv6hdr)); 1308 goto emsgsize; 1309 } 1310 1311 if (ip6_sk_ignore_df(sk)) 1312 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1313 else 1314 maxnonfragsize = mtu; 1315 1316 if (cork->length + length > maxnonfragsize - headersize) { 1317 emsgsize: 1318 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1319 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1320 return -EMSGSIZE; 1321 } 1322 1323 /* CHECKSUM_PARTIAL only with no extension headers and when 1324 * we are not going to fragment 1325 */ 1326 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1327 headersize == sizeof(struct ipv6hdr) && 1328 length <= mtu - headersize && 1329 (!(flags & MSG_MORE) || cork->gso_size) && 1330 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1331 csummode = CHECKSUM_PARTIAL; 1332 1333 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { 1334 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1335 if (!uarg) 1336 return -ENOBUFS; 1337 extra_uref = true; 1338 if (rt->dst.dev->features & NETIF_F_SG && 1339 csummode == CHECKSUM_PARTIAL) { 1340 paged = true; 1341 } else { 1342 uarg->zerocopy = 0; 1343 skb_zcopy_set(skb, uarg, &extra_uref); 1344 } 1345 } 1346 1347 /* 1348 * Let's try using as much space as possible. 1349 * Use MTU if total length of the message fits into the MTU. 1350 * Otherwise, we need to reserve fragment header and 1351 * fragment alignment (= 8-15 octects, in total). 1352 * 1353 * Note that we may need to "move" the data from the tail of 1354 * of the buffer to the new fragment when we split 1355 * the message. 1356 * 1357 * FIXME: It may be fragmented into multiple chunks 1358 * at once if non-fragmentable extension headers 1359 * are too large. 1360 * --yoshfuji 1361 */ 1362 1363 cork->length += length; 1364 if (!skb) 1365 goto alloc_new_skb; 1366 1367 while (length > 0) { 1368 /* Check if the remaining data fits into current packet. */ 1369 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; 1370 if (copy < length) 1371 copy = maxfraglen - skb->len; 1372 1373 if (copy <= 0) { 1374 char *data; 1375 unsigned int datalen; 1376 unsigned int fraglen; 1377 unsigned int fraggap; 1378 unsigned int alloclen; 1379 unsigned int pagedlen; 1380 alloc_new_skb: 1381 /* There's no room in the current skb */ 1382 if (skb) 1383 fraggap = skb->len - maxfraglen; 1384 else 1385 fraggap = 0; 1386 /* update mtu and maxfraglen if necessary */ 1387 if (!skb || !skb_prev) 1388 ip6_append_data_mtu(&mtu, &maxfraglen, 1389 fragheaderlen, skb, rt, 1390 orig_mtu); 1391 1392 skb_prev = skb; 1393 1394 /* 1395 * If remaining data exceeds the mtu, 1396 * we know we need more fragment(s). 1397 */ 1398 datalen = length + fraggap; 1399 1400 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) 1401 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1402 fraglen = datalen + fragheaderlen; 1403 pagedlen = 0; 1404 1405 if ((flags & MSG_MORE) && 1406 !(rt->dst.dev->features&NETIF_F_SG)) 1407 alloclen = mtu; 1408 else if (!paged) 1409 alloclen = fraglen; 1410 else { 1411 alloclen = min_t(int, fraglen, MAX_HEADER); 1412 pagedlen = fraglen - alloclen; 1413 } 1414 1415 alloclen += dst_exthdrlen; 1416 1417 if (datalen != length + fraggap) { 1418 /* 1419 * this is not the last fragment, the trailer 1420 * space is regarded as data space. 1421 */ 1422 datalen += rt->dst.trailer_len; 1423 } 1424 1425 alloclen += rt->dst.trailer_len; 1426 fraglen = datalen + fragheaderlen; 1427 1428 /* 1429 * We just reserve space for fragment header. 1430 * Note: this may be overallocation if the message 1431 * (without MSG_MORE) fits into the MTU. 1432 */ 1433 alloclen += sizeof(struct frag_hdr); 1434 1435 copy = datalen - transhdrlen - fraggap - pagedlen; 1436 if (copy < 0) { 1437 err = -EINVAL; 1438 goto error; 1439 } 1440 if (transhdrlen) { 1441 skb = sock_alloc_send_skb(sk, 1442 alloclen + hh_len, 1443 (flags & MSG_DONTWAIT), &err); 1444 } else { 1445 skb = NULL; 1446 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1447 2 * sk->sk_sndbuf) 1448 skb = alloc_skb(alloclen + hh_len, 1449 sk->sk_allocation); 1450 if (unlikely(!skb)) 1451 err = -ENOBUFS; 1452 } 1453 if (!skb) 1454 goto error; 1455 /* 1456 * Fill in the control structures 1457 */ 1458 skb->protocol = htons(ETH_P_IPV6); 1459 skb->ip_summed = csummode; 1460 skb->csum = 0; 1461 /* reserve for fragmentation and ipsec header */ 1462 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1463 dst_exthdrlen); 1464 1465 /* 1466 * Find where to start putting bytes 1467 */ 1468 data = skb_put(skb, fraglen - pagedlen); 1469 skb_set_network_header(skb, exthdrlen); 1470 data += fragheaderlen; 1471 skb->transport_header = (skb->network_header + 1472 fragheaderlen); 1473 if (fraggap) { 1474 skb->csum = skb_copy_and_csum_bits( 1475 skb_prev, maxfraglen, 1476 data + transhdrlen, fraggap, 0); 1477 skb_prev->csum = csum_sub(skb_prev->csum, 1478 skb->csum); 1479 data += fraggap; 1480 pskb_trim_unique(skb_prev, maxfraglen); 1481 } 1482 if (copy > 0 && 1483 getfrag(from, data + transhdrlen, offset, 1484 copy, fraggap, skb) < 0) { 1485 err = -EFAULT; 1486 kfree_skb(skb); 1487 goto error; 1488 } 1489 1490 offset += copy; 1491 length -= copy + transhdrlen; 1492 transhdrlen = 0; 1493 exthdrlen = 0; 1494 dst_exthdrlen = 0; 1495 1496 /* Only the initial fragment is time stamped */ 1497 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1498 cork->tx_flags = 0; 1499 skb_shinfo(skb)->tskey = tskey; 1500 tskey = 0; 1501 skb_zcopy_set(skb, uarg, &extra_uref); 1502 1503 if ((flags & MSG_CONFIRM) && !skb_prev) 1504 skb_set_dst_pending_confirm(skb, 1); 1505 1506 /* 1507 * Put the packet on the pending queue 1508 */ 1509 if (!skb->destructor) { 1510 skb->destructor = sock_wfree; 1511 skb->sk = sk; 1512 wmem_alloc_delta += skb->truesize; 1513 } 1514 __skb_queue_tail(queue, skb); 1515 continue; 1516 } 1517 1518 if (copy > length) 1519 copy = length; 1520 1521 if (!(rt->dst.dev->features&NETIF_F_SG) && 1522 skb_tailroom(skb) >= copy) { 1523 unsigned int off; 1524 1525 off = skb->len; 1526 if (getfrag(from, skb_put(skb, copy), 1527 offset, copy, off, skb) < 0) { 1528 __skb_trim(skb, off); 1529 err = -EFAULT; 1530 goto error; 1531 } 1532 } else if (!uarg || !uarg->zerocopy) { 1533 int i = skb_shinfo(skb)->nr_frags; 1534 1535 err = -ENOMEM; 1536 if (!sk_page_frag_refill(sk, pfrag)) 1537 goto error; 1538 1539 if (!skb_can_coalesce(skb, i, pfrag->page, 1540 pfrag->offset)) { 1541 err = -EMSGSIZE; 1542 if (i == MAX_SKB_FRAGS) 1543 goto error; 1544 1545 __skb_fill_page_desc(skb, i, pfrag->page, 1546 pfrag->offset, 0); 1547 skb_shinfo(skb)->nr_frags = ++i; 1548 get_page(pfrag->page); 1549 } 1550 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1551 if (getfrag(from, 1552 page_address(pfrag->page) + pfrag->offset, 1553 offset, copy, skb->len, skb) < 0) 1554 goto error_efault; 1555 1556 pfrag->offset += copy; 1557 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1558 skb->len += copy; 1559 skb->data_len += copy; 1560 skb->truesize += copy; 1561 wmem_alloc_delta += copy; 1562 } else { 1563 err = skb_zerocopy_iter_dgram(skb, from, copy); 1564 if (err < 0) 1565 goto error; 1566 } 1567 offset += copy; 1568 length -= copy; 1569 } 1570 1571 if (wmem_alloc_delta) 1572 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1573 return 0; 1574 1575 error_efault: 1576 err = -EFAULT; 1577 error: 1578 if (uarg) 1579 sock_zerocopy_put_abort(uarg, extra_uref); 1580 cork->length -= length; 1581 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1582 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1583 return err; 1584 } 1585 1586 int ip6_append_data(struct sock *sk, 1587 int getfrag(void *from, char *to, int offset, int len, 1588 int odd, struct sk_buff *skb), 1589 void *from, int length, int transhdrlen, 1590 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1591 struct rt6_info *rt, unsigned int flags) 1592 { 1593 struct inet_sock *inet = inet_sk(sk); 1594 struct ipv6_pinfo *np = inet6_sk(sk); 1595 int exthdrlen; 1596 int err; 1597 1598 if (flags&MSG_PROBE) 1599 return 0; 1600 if (skb_queue_empty(&sk->sk_write_queue)) { 1601 /* 1602 * setup for corking 1603 */ 1604 err = ip6_setup_cork(sk, &inet->cork, &np->cork, 1605 ipc6, rt, fl6); 1606 if (err) 1607 return err; 1608 1609 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1610 length += exthdrlen; 1611 transhdrlen += exthdrlen; 1612 } else { 1613 fl6 = &inet->cork.fl.u.ip6; 1614 transhdrlen = 0; 1615 } 1616 1617 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, 1618 &np->cork, sk_page_frag(sk), getfrag, 1619 from, length, transhdrlen, flags, ipc6); 1620 } 1621 EXPORT_SYMBOL_GPL(ip6_append_data); 1622 1623 static void ip6_cork_release(struct inet_cork_full *cork, 1624 struct inet6_cork *v6_cork) 1625 { 1626 if (v6_cork->opt) { 1627 kfree(v6_cork->opt->dst0opt); 1628 kfree(v6_cork->opt->dst1opt); 1629 kfree(v6_cork->opt->hopopt); 1630 kfree(v6_cork->opt->srcrt); 1631 kfree(v6_cork->opt); 1632 v6_cork->opt = NULL; 1633 } 1634 1635 if (cork->base.dst) { 1636 dst_release(cork->base.dst); 1637 cork->base.dst = NULL; 1638 cork->base.flags &= ~IPCORK_ALLFRAG; 1639 } 1640 memset(&cork->fl, 0, sizeof(cork->fl)); 1641 } 1642 1643 struct sk_buff *__ip6_make_skb(struct sock *sk, 1644 struct sk_buff_head *queue, 1645 struct inet_cork_full *cork, 1646 struct inet6_cork *v6_cork) 1647 { 1648 struct sk_buff *skb, *tmp_skb; 1649 struct sk_buff **tail_skb; 1650 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; 1651 struct ipv6_pinfo *np = inet6_sk(sk); 1652 struct net *net = sock_net(sk); 1653 struct ipv6hdr *hdr; 1654 struct ipv6_txoptions *opt = v6_cork->opt; 1655 struct rt6_info *rt = (struct rt6_info *)cork->base.dst; 1656 struct flowi6 *fl6 = &cork->fl.u.ip6; 1657 unsigned char proto = fl6->flowi6_proto; 1658 1659 skb = __skb_dequeue(queue); 1660 if (!skb) 1661 goto out; 1662 tail_skb = &(skb_shinfo(skb)->frag_list); 1663 1664 /* move skb->data to ip header from ext header */ 1665 if (skb->data < skb_network_header(skb)) 1666 __skb_pull(skb, skb_network_offset(skb)); 1667 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1668 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1669 *tail_skb = tmp_skb; 1670 tail_skb = &(tmp_skb->next); 1671 skb->len += tmp_skb->len; 1672 skb->data_len += tmp_skb->len; 1673 skb->truesize += tmp_skb->truesize; 1674 tmp_skb->destructor = NULL; 1675 tmp_skb->sk = NULL; 1676 } 1677 1678 /* Allow local fragmentation. */ 1679 skb->ignore_df = ip6_sk_ignore_df(sk); 1680 1681 *final_dst = fl6->daddr; 1682 __skb_pull(skb, skb_network_header_len(skb)); 1683 if (opt && opt->opt_flen) 1684 ipv6_push_frag_opts(skb, opt, &proto); 1685 if (opt && opt->opt_nflen) 1686 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); 1687 1688 skb_push(skb, sizeof(struct ipv6hdr)); 1689 skb_reset_network_header(skb); 1690 hdr = ipv6_hdr(skb); 1691 1692 ip6_flow_hdr(hdr, v6_cork->tclass, 1693 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1694 ip6_autoflowlabel(net, np), fl6)); 1695 hdr->hop_limit = v6_cork->hop_limit; 1696 hdr->nexthdr = proto; 1697 hdr->saddr = fl6->saddr; 1698 hdr->daddr = *final_dst; 1699 1700 skb->priority = sk->sk_priority; 1701 skb->mark = sk->sk_mark; 1702 1703 skb->tstamp = cork->base.transmit_time; 1704 1705 skb_dst_set(skb, dst_clone(&rt->dst)); 1706 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); 1707 if (proto == IPPROTO_ICMPV6) { 1708 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1709 1710 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); 1711 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 1712 } 1713 1714 ip6_cork_release(cork, v6_cork); 1715 out: 1716 return skb; 1717 } 1718 1719 int ip6_send_skb(struct sk_buff *skb) 1720 { 1721 struct net *net = sock_net(skb->sk); 1722 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 1723 int err; 1724 1725 err = ip6_local_out(net, skb->sk, skb); 1726 if (err) { 1727 if (err > 0) 1728 err = net_xmit_errno(err); 1729 if (err) 1730 IP6_INC_STATS(net, rt->rt6i_idev, 1731 IPSTATS_MIB_OUTDISCARDS); 1732 } 1733 1734 return err; 1735 } 1736 1737 int ip6_push_pending_frames(struct sock *sk) 1738 { 1739 struct sk_buff *skb; 1740 1741 skb = ip6_finish_skb(sk); 1742 if (!skb) 1743 return 0; 1744 1745 return ip6_send_skb(skb); 1746 } 1747 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 1748 1749 static void __ip6_flush_pending_frames(struct sock *sk, 1750 struct sk_buff_head *queue, 1751 struct inet_cork_full *cork, 1752 struct inet6_cork *v6_cork) 1753 { 1754 struct sk_buff *skb; 1755 1756 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 1757 if (skb_dst(skb)) 1758 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 1759 IPSTATS_MIB_OUTDISCARDS); 1760 kfree_skb(skb); 1761 } 1762 1763 ip6_cork_release(cork, v6_cork); 1764 } 1765 1766 void ip6_flush_pending_frames(struct sock *sk) 1767 { 1768 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 1769 &inet_sk(sk)->cork, &inet6_sk(sk)->cork); 1770 } 1771 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 1772 1773 struct sk_buff *ip6_make_skb(struct sock *sk, 1774 int getfrag(void *from, char *to, int offset, 1775 int len, int odd, struct sk_buff *skb), 1776 void *from, int length, int transhdrlen, 1777 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1778 struct rt6_info *rt, unsigned int flags, 1779 struct inet_cork_full *cork) 1780 { 1781 struct inet6_cork v6_cork; 1782 struct sk_buff_head queue; 1783 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1784 int err; 1785 1786 if (flags & MSG_PROBE) 1787 return NULL; 1788 1789 __skb_queue_head_init(&queue); 1790 1791 cork->base.flags = 0; 1792 cork->base.addr = 0; 1793 cork->base.opt = NULL; 1794 cork->base.dst = NULL; 1795 v6_cork.opt = NULL; 1796 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); 1797 if (err) { 1798 ip6_cork_release(cork, &v6_cork); 1799 return ERR_PTR(err); 1800 } 1801 if (ipc6->dontfrag < 0) 1802 ipc6->dontfrag = inet6_sk(sk)->dontfrag; 1803 1804 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, 1805 ¤t->task_frag, getfrag, from, 1806 length + exthdrlen, transhdrlen + exthdrlen, 1807 flags, ipc6); 1808 if (err) { 1809 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); 1810 return ERR_PTR(err); 1811 } 1812 1813 return __ip6_make_skb(sk, &queue, cork, &v6_cork); 1814 } 1815