/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
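/* Local output path, roughly: ip6_xmit() or ip6_send_skb() hand a packet to
 * dst_output(), which for IPv6 routes resolves to ip6_output() ->
 * ip6_finish_output() -> ip6_finish_output2(); ip6_fragment() is invoked on
 * the way down when the packet exceeds the path MTU.  ip6_forward() handles
 * transit traffic instead.
 */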
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
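/* A socket that never saw IPV6_AUTOFLOWLABEL keeps autoflowlabel_set clear
 * and follows the per-netns default (the net.ipv6.auto_flowlabels sysctl);
 * once the option has been set, the per-socket value wins.
 */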
static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
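/* Deliver a Router Alert packet to every raw socket that registered the
 * matching alert value via IPV6_ROUTER_ALERT.  Every listener but the last
 * receives a clone; the original skb goes to the last match, which saves one
 * copy.  Returns 1 if the packet was consumed.
 */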
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* A unicast neighbor discovery message destined
			 * to the proxied address is passed to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	return dst_output(net, sk, skb);
}

static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
		return false;

	return true;
}
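/* Forward a packet on behalf of another host: check that forwarding is
 * enabled, enforce the hop limit, honour proxy NDP, consult xfrm policy,
 * emit a redirect when the packet leaves on the device it arrived on, and
 * bounce oversized packets with an ICMPV6_PKT_TOOBIG error.
 */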
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}
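	/* IPV6_MIN_MTU is 1280 octets (RFC 2460); never advertise less than
	 * that in a Packet Too Big message.
	 */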
	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
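/* Split the packet into fragments and feed them to @output.  The fast path
 * reuses an existing frag_list when its geometry already fits (every
 * fragment within the MTU, all but the last a multiple of 8 bytes);
 * otherwise the slow path allocates new skbs and copies the payload out
 * block by block.
 */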
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);
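	/* mtu is now the payload budget of one fragment, e.g. a 1500 byte
	 * link MTU and a bare 40 byte IPv6 header leave 1500 - 40 - 8 = 1452
	 * bytes; the slow path below further rounds non-final fragments down
	 * to a multiple of 8 (1448 bytes of payload each).
	 */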
	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;
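		/* The fragment offset field counts 8-octet units in its
		 * upper 13 bits.  As 'offset' is always a multiple of 8
		 * here, htons(offset) encodes offset / 8 in those bits with
		 * the flag bits clear, so IP6_MF can simply be OR'd in for
		 * non-final fragments.
		 */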
		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
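/* A cached route is stale unless it is a /128 host route for fl_addr, or
 * addr_cache records that fl_addr was the last destination the (network)
 * route served.  Returns nonzero when the cached dst should be dropped.
 */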
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
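/* Sketch of a typical connection-oriented caller of the lookup below:
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */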
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (!dst)
		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
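/* Extension headers encode their length in 8-octet units, not counting the
 * first 8 octets, hence the (hdrlen + 1) * 8 byte copies below (hdrlen == 0
 * means an 8 byte header).
 */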
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	cork->base.fragsize = mtu;
	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
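/* Append data to the pending queue: while the device advertises NETIF_F_SG,
 * bytes are coalesced into page fragments of the tail skb; otherwise (or
 * once maxfraglen is reached) a new skb is started, with headroom reserved
 * so that a fragment header can be pushed later without reallocating.
 */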
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);
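	/* e.g. mtu = 1500 with fragheaderlen = 40 (no extension headers)
	 * gives maxfraglen = ((1500 - 40) & ~7) + 40 - 8 = 1488, i.e. 1448
	 * bytes of payload per fragment once the IPv6 and fragment headers
	 * are in place.
	 */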
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		ipv6_local_error(sk, EMSGSIZE, fl6,
				 mtu - headersize +
				 sizeof(struct ipv6hdr));
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
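			/* The first skb carries the transport header and may
			 * block in sock_alloc_send_skb(); follow-on
			 * fragments are charged via sock_wmalloc() and
			 * bounded by 2 * sk_sndbuf to cap queued memory.
			 */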
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
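/* Sketch of a typical datagram sender built on this API (cf. the UDPv6
 * sendmsg path):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, ...);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 */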
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
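/* Single-shot variant of the append/push sequence above: the datagram is
 * built on a private queue with its own cork state, so no pending data ever
 * hangs off the socket itself.
 */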
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err)
		return ERR_PTR(err);

	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}