/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
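
/*
 * Usage sketch (illustrative only; "my_skb" and the surrounding error
 * handling are hypothetical, but the pattern mirrors in-tree callers):
 * a fully built IPv6 packet with a dst attached is handed to
 * ip6_local_out(), which runs the NF_INET_LOCAL_OUT hook and, if the
 * hook accepts (nf_hook() returns 1), continues into dst_output().
 *
 *	skb_dst_set(my_skb, dst);
 *	err = ip6_local_out(my_skb);
 *	if (err < 0)
 *		goto drop;
 */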

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
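
/*
 * Overview of the local output path implemented above (descriptive
 * comment only):
 *
 *	ip6_output()
 *	  -> NF_INET_POST_ROUTING hook (skipped for IP6SKB_REROUTED skbs)
 *	  -> ip6_finish_output()
 *	       -> ip6_fragment()        if the packet exceeds the dst MTU
 *	                                (and is not GSO) or the route
 *	                                requires fragmenting everything
 *	       -> ip6_finish_output2()  resolves the neighbour and hands
 *	                                the skb to neigh_output()
 */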

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
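
/*
 * Usage sketch (illustrative; the variables are hypothetical, but the
 * call pattern follows connection-oriented callers such as TCP): the
 * caller builds the transport header, attaches the routed dst, and
 * lets ip6_xmit() prepend the extension headers and IPv6 header:
 *
 *	skb_dst_set(skb, dst_clone(dst));
 *	err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 */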

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}
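
/*
 * Illustrative call (a sketch of how the neighbour discovery code is
 * expected to use this helper; "ndisc_skb" and "payload_len" are
 * hypothetical names):
 *
 *	ip6_nd_hdr(sk, ndisc_skb, dev, saddr, daddr,
 *		   IPPROTO_ICMPV6, payload_len);
 */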

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
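
/*
 * Note on the Router Alert selector passed in as "sel": a Router Alert
 * option (RFC 2711) is laid out as
 *
 *	ptr[0] = option type, ptr[1] = 2 (data length),
 *	ptr[2..3] = 16-bit value in network byte order
 *
 * which is why the caller in ip6_forward() reconstructs the selector
 * as (ptr[2] << 8) + ptr[3] before walking the chain above.
 */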

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT do any processing on RA packets;
	 *	we push them to user level AS IS
	 *	without any warranty that an application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything.
	 *	Defragmentation would also be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
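
/*
 * Worked example of the MTU clamp above: every IPv6 link is required
 * to support an MTU of at least 1280 bytes (IPV6_MIN_MTU, RFC 2460),
 * so even if a route reports, say, dst_mtu() == 576, the forwarding
 * path treats the limit as 1280 and any Packet Too Big error it emits
 * advertises at least that value.
 */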

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
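
/*
 * Worked example for ip6_find_1stfragopt(): per RFC 2460, the
 * "unfragmentable part" ends after the last routing-relevant extension
 * header. For a packet laid out as
 *
 *	IPv6 (40 bytes) | Hop-by-Hop (8) | Routing (24) | TCP ...
 *
 * the walk skips Hop-by-Hop (offset 40 -> 48) and Routing (48 -> 72),
 * then stops at TCP, so the function returns 72 and *nexthdr points at
 * the routing header's nexthdr field, where NEXTHDR_FRAGMENT will be
 * patched in by the fragmentation code.
 */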

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}
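
/*
 * Note on the fallback above: when no inet_peer is available, the
 * fragment id comes from a single global counter. The cmpxchg loop
 * makes the increment atomic without a lock, and the "if (!new)" test
 * skips zero on wraparound: when the counter would wrap to 0 it is
 * bumped to 1, so an id of 0 is never emitted from this path.
 */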

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
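
/*
 * Worked example of the size arithmetic in ip6_fragment(): with a path
 * MTU of 1500 and no extension headers, hlen is 40 (the bare IPv6
 * header), so after "mtu -= hlen + sizeof(struct frag_hdr)" each
 * fragment may carry up to 1500 - 40 - 8 = 1452 bytes of fragmentable
 * payload. The slow path then rounds non-final fragments down to a
 * multiple of 8 (len &= ~7), i.e. 1448 payload bytes per full
 * fragment: 40 (IPv6) + 8 (fragment header) + 1448 = 1496 bytes on
 * the wire.
 */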

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE.		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If the route was a host route, check that
	 *    the cached destination is current. If it is a network
	 *    route, we still may check its validity using the saved
	 *    pointer to the last used address: daddr_cache. We do not
	 *    want to save the whole address now (because the main
	 *    consumer of this service is TCP, which does not have this
	 *    problem), so the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
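
/*
 * Usage sketch (illustrative; the surrounding declarations are
 * hypothetical, but the pattern matches typical callers):
 *
 *	struct dst_entry *dst;
 *	int err = ip6_dst_lookup(sk, &dst, &fl6);
 *	if (err)
 *		return err;	// dst is guaranteed NULL on failure
 */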

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
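
/*
 * Usage sketch for the *_lookup_flow() variants (illustrative; the
 * error handling follows the pointer-encoded convention documented
 * above):
 *
 *	dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, true);
 *	if (IS_ERR(dst)) {
 *		err = PTR_ERR(dst);
 *		goto out;	// no dst reference is held on error
 *	}
 */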

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by the network
	 * device, so create one single skb packet containing the
	 * complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path.
	 */
	kfree_skb(skb);

	return err;
}
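
/*
 * Worked example of the gso_size computation above: with mtu 1500 and
 * fragheaderlen 40 (bare IPv6 header), each software-emulated fragment
 * may carry (1500 - 40 - 8) & ~7 = 1448 bytes of payload, the largest
 * multiple of 8 that still fits alongside the IPv6 and fragment
 * headers in one MTU-sized frame.
 */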

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags & MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
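	/*
	 * Worked example of the line above: with mtu 1500 and no
	 * extension headers, fragheaderlen is 40, so
	 * ((1500 - 40) & ~7) = 1456 is the largest 8-aligned payload,
	 * and maxfraglen = 1456 + 40 - 8 = 1488 bytes of packet per
	 * fragment once room for the fragment header is reserved.
	 */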

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}
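
/*
 * Usage sketch for the corking API implemented in this file
 * (illustrative; "corked" is a hypothetical flag, but the flow mirrors
 * datagram senders such as UDP):
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      hlimit, tclass, opt, &fl6, rt, flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);		// drop queued data
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);	// build + send
 */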

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}