/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}
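
/* Editor's note (illustrative, not from the original file): a per-socket
 * IPV6_AUTOFLOWLABEL setting, once it has been explicitly set, takes
 * precedence over the net.ipv6.auto_flowlabels sysctl default. For example,
 * with auto_flowlabels enabled system-wide, a socket that has done
 * setsockopt(..., IPV6_AUTOFLOWLABEL, 0) makes ip6_autoflowlabel() return
 * false, so no flow label is auto-generated for it.
 */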

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(skb_headroom(skb) < head_room)) {
		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
		if (!skb2) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
			kfree_skb(skb);
			return -ENOBUFS;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
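
/* Editor's note (descriptive, not from the original file): ip6_call_ra_chain()
 * above walks the Router Alert socket list; every matching socket except the
 * last one receives a clone of the skb, and the last match consumes the
 * original via rawv6_rcv(). A return value of 1 therefore means the packet
 * has been absorbed and must not be forwarded any further.
 */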

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a unicast neighbor discovery message destined
			 * to the proxied address, pass it to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb->tstamp = 0;
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on RA packets; we push them to
	 *	user level AS IS, without any warranty that the application
	 *	will be able to interpret them. The reason is that we cannot
	 *	make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains AH/ESP we
	 *	cannot do anything with it. Defragmentation would also be a
	 *	mistake: RA packets cannot be fragmented, because there is no
	 *	warranty that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

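	/* Editor's note (descriptive, not from the original file): the dst
	 * pointer was just re-read above because xfrm6_route_forward() may
	 * have switched the skb to an IPsec dst; the redirect and MTU checks
	 * below operate on the refreshed route.
	 */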
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
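
/* Editor's note (illustrative example, not from the original file): in
 * ip6_fragment() below, the usable per-fragment payload is the MTU minus
 * the unfragmentable part (hlen) and the 8-byte fragment header, rounded
 * down to a multiple of 8. E.g. with a 1500-byte MTU and a bare 40-byte
 * IPv6 header: 1500 - 40 - 8 = 1452, so each non-final fragment carries
 * 1448 bytes of fragmentable data (1452 & ~7).
 */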

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}
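
/* Editor's note (descriptive, not from the original file): ip6_rt_check()
 * returns non-zero when the cached route can no longer be trusted for this
 * flow: the route is not a /128 host route matching the flow address, and
 * the remembered last-used address (addr_cache) does not match it either.
 * A zero return means this particular check does not invalidate the cached
 * dst used below.
 */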

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace the lookup is done in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}
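
/* Editor's note (illustrative, not from the original file): extension headers
 * encode their length in 8-octet units excluding the first 8 octets
 * (RFC 8200), so (hdrlen + 1) * 8 above is the full header size in bytes.
 * E.g. hdrlen == 0 means an 8-byte header, hdrlen == 2 a 24-byte one.
 */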

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
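
/* Editor's note (illustrative example, not from the original file): in
 * __ip6_append_data() below, maxfraglen is computed as
 *   ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr)
 * which keeps the fragmentable part 8-byte aligned while reserving room for
 * the fragment header. With mtu = 1500 and fragheaderlen = 40 (a bare IPv6
 * header, no extension headers) that gives (1460 & ~7) + 40 - 8 = 1488.
 */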

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	if (uarg)
		sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}
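
/* Editor's note (usage sketch, not from the original file): datagram
 * protocols use the functions below in a cork-and-push pattern, roughly:
 *
 *	lock_sock(sk);
 *	err = ip6_append_data(sk, getfrag, msg, len, ..., rt, flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *	release_sock(sk);
 *
 * The exact call sequence varies per protocol; see udpv6_sendmsg() and
 * rawv6_sendmsg() for the real callers.
 */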

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
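
/* Editor's note (descriptive, not from the original file): ip6_make_skb()
 * below is the uncorked fast path. It builds the whole datagram on a private
 * queue using a caller-supplied cork, so it does not touch
 * sk->sk_write_queue or the socket's pending cork state; UDPv6 uses it when
 * the caller is not corking.
 */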

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}