/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			if (skb->sk)
				skb_set_owner_w(skb2, skb->sk);
			consume_skb(skb);
			skb = skb2;
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb_mark_not_on_list(skb);
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct fib6_info *from;
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
		tskey = sk->sk_tskey++;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ?
			mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged)
				alloclen = fraglen;
			else {
				alloclen = min_t(int, fraglen, MAX_HEADER);
				pagedlen = fraglen - alloclen;
			}

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap - pagedlen;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation.
	 */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->tstamp = cork->base.transmit_time;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}