1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
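/* Final transmission step: make sure there is enough headroom for the
 * link-layer header, loop multicast copies back to the stack when required,
 * honour lwtunnel xmit redirects, then resolve (or create) the neighbour
 * entry for the next hop and hand the skb to neigh_output().
 */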
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 struct dst_entry *dst = skb_dst(skb);
63 struct net_device *dev = dst->dev;
64 struct inet6_dev *idev = ip6_dst_idev(dst);
65 unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 const struct in6_addr *daddr, *nexthop;
67 struct ipv6hdr *hdr;
68 struct neighbour *neigh;
69 int ret;
70
71 /* Be paranoid, rather than too clever. */
72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 /* Make sure idev stays alive */
74 rcu_read_lock();
75 skb = skb_expand_head(skb, hh_len);
76 if (!skb) {
77 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
78 rcu_read_unlock();
79 return -ENOMEM;
80 }
81 rcu_read_unlock();
82 }
83
84 hdr = ipv6_hdr(skb);
85 daddr = &hdr->daddr;
86 if (ipv6_addr_is_multicast(daddr)) {
87 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
88 ((mroute6_is_socket(net, skb) &&
89 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
90 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
91 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
92
93 /* Do not check for IFF_ALLMULTI; multicast routing
94 is not supported in any case.
95 */
96 if (newskb)
97 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
98 net, sk, newskb, NULL, newskb->dev,
99 dev_loopback_xmit);
100
101 if (hdr->hop_limit == 0) {
102 IP6_INC_STATS(net, idev,
103 IPSTATS_MIB_OUTDISCARDS);
104 kfree_skb(skb);
105 return 0;
106 }
107 }
108
109 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
110 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
111 !(dev->flags & IFF_LOOPBACK)) {
112 kfree_skb(skb);
113 return 0;
114 }
115 }
116
117 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
118 int res = lwtunnel_xmit(skb);
119
120 if (res != LWTUNNEL_XMIT_CONTINUE)
121 return res;
122 }
123
124 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
125
126 rcu_read_lock();
127 nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
128 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
129
130 if (unlikely(IS_ERR_OR_NULL(neigh))) {
131 if (unlikely(!neigh))
132 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
133 if (IS_ERR(neigh)) {
134 rcu_read_unlock();
135 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
136 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
137 return -EINVAL;
138 }
139 }
140 sock_confirm_neigh(skb, neigh);
141 ret = neigh_output(neigh, skb, false);
142 rcu_read_unlock();
143 return ret;
144 }
145
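/* Software-segment a GSO skb whose segments would exceed the egress MTU,
 * then transmit each resulting segment individually, fragmenting any
 * segment that is still larger than the MTU.
 */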
146 static int
147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
148 struct sk_buff *skb, unsigned int mtu)
149 {
150 struct sk_buff *segs, *nskb;
151 netdev_features_t features;
152 int ret = 0;
153
154 /* Please see corresponding comment in ip_finish_output_gso
155 * describing the cases where GSO segment length exceeds the
156 * egress MTU.
157 */
158 features = netif_skb_features(skb);
159 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
160 if (IS_ERR_OR_NULL(segs)) {
161 kfree_skb(skb);
162 return -ENOMEM;
163 }
164
165 consume_skb(skb);
166
167 skb_list_walk_safe(segs, segs, nskb) {
168 int err;
169
170 skb_mark_not_on_list(segs);
171 /* Last GSO segment can be smaller than gso_size (and MTU).
172 * Adding a fragment header would produce an "atomic fragment",
173 * which is considered harmful (RFC-8021). Avoid that.
174 */
175 err = segs->len > mtu ?
176 ip6_fragment(net, sk, segs, ip6_finish_output2) :
177 ip6_finish_output2(net, sk, segs);
178 if (err && ret == 0)
179 ret = err;
180 }
181
182 return ret;
183 }
184
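/* Decide how to transmit: re-run dst_output() if an XFRM policy rerouted
 * the packet after SNAT, take the GSO slow path when the validated segment
 * length exceeds the MTU, fragment oversized packets, or send directly via
 * ip6_finish_output2().
 */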
185 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 unsigned int mtu;
188
189 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
190 /* Policy lookup after SNAT yielded a new policy */
191 if (skb_dst(skb)->xfrm) {
192 IP6CB(skb)->flags |= IP6SKB_REROUTED;
193 return dst_output(net, sk, skb);
194 }
195 #endif
196
197 mtu = ip6_skb_dst_mtu(skb);
198 if (skb_is_gso(skb) &&
199 !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
200 !skb_gso_validate_network_len(skb, mtu))
201 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
202
203 if ((skb->len > mtu && !skb_is_gso(skb)) ||
204 dst_allfrag(skb_dst(skb)) ||
205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
206 return ip6_fragment(net, sk, skb, ip6_finish_output2);
207 else
208 return ip6_finish_output2(net, sk, skb);
209 }
210
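/* Run the cgroup BPF egress program before handing the packet to
 * __ip6_finish_output(); a verdict other than SUCCESS/CN drops the skb.
 */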
211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 int ret;
214
215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 switch (ret) {
217 case NET_XMIT_SUCCESS:
218 case NET_XMIT_CN:
219 return __ip6_finish_output(net, sk, skb) ? : ret;
220 default:
221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 return ret;
223 }
224 }
225
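/* dst_output() entry point for IPv6: set the outgoing device and protocol,
 * drop the packet if IPv6 is administratively disabled on the device, and
 * run the NF_INET_POST_ROUTING hook (skipped for packets already rerouted
 * through it).
 */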
226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
227 {
228 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
229 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
230
231 skb->protocol = htons(ETH_P_IPV6);
232 skb->dev = dev;
233
234 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
235 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
236 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
237 return 0;
238 }
239
240 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
241 net, sk, skb, indev, dev,
242 ip6_finish_output,
243 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
244 }
245 EXPORT_SYMBOL(ip6_output);
246
247 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
248 {
249 if (!np->autoflowlabel_set)
250 return ip6_default_np_autolabel(net);
251 else
252 return np->autoflowlabel;
253 }
254
255 /*
256 * xmit an sk_buff (used by TCP, SCTP and DCCP)
257 * Note : the socket lock is not held for SYNACK packets, but the socket
258 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
259 * which use proper atomic operations or spinlocks.
260 */
261 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
262 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
263 {
264 struct net *net = sock_net(sk);
265 const struct ipv6_pinfo *np = inet6_sk(sk);
266 struct in6_addr *first_hop = &fl6->daddr;
267 struct dst_entry *dst = skb_dst(skb);
268 struct net_device *dev = dst->dev;
269 struct inet6_dev *idev = ip6_dst_idev(dst);
270 struct hop_jumbo_hdr *hop_jumbo;
271 int hoplen = sizeof(*hop_jumbo);
272 unsigned int head_room;
273 struct ipv6hdr *hdr;
274 u8 proto = fl6->flowi6_proto;
275 int seg_len = skb->len;
276 int hlimit = -1;
277 u32 mtu;
278
279 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
280 if (opt)
281 head_room += opt->opt_nflen + opt->opt_flen;
282
283 if (unlikely(head_room > skb_headroom(skb))) {
284 /* Make sure idev stays alive */
285 rcu_read_lock();
286 skb = skb_expand_head(skb, head_room);
287 if (!skb) {
288 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
289 rcu_read_unlock();
290 return -ENOBUFS;
291 }
292 rcu_read_unlock();
293 }
294
295 if (opt) {
296 seg_len += opt->opt_nflen + opt->opt_flen;
297
298 if (opt->opt_flen)
299 ipv6_push_frag_opts(skb, opt, &proto);
300
301 if (opt->opt_nflen)
302 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
303 &fl6->saddr);
304 }
305
306 if (unlikely(seg_len > IPV6_MAXPLEN)) {
307 hop_jumbo = skb_push(skb, hoplen);
308
309 hop_jumbo->nexthdr = proto;
310 hop_jumbo->hdrlen = 0;
311 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
312 hop_jumbo->tlv_len = 4;
313 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
314
315 proto = IPPROTO_HOPOPTS;
316 seg_len = 0;
317 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
318 }
319
320 skb_push(skb, sizeof(struct ipv6hdr));
321 skb_reset_network_header(skb);
322 hdr = ipv6_hdr(skb);
323
324 /*
325 * Fill in the IPv6 header
326 */
327 if (np)
328 hlimit = np->hop_limit;
329 if (hlimit < 0)
330 hlimit = ip6_dst_hoplimit(dst);
331
332 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
333 ip6_autoflowlabel(net, np), fl6));
334
335 hdr->payload_len = htons(seg_len);
336 hdr->nexthdr = proto;
337 hdr->hop_limit = hlimit;
338
339 hdr->saddr = fl6->saddr;
340 hdr->daddr = *first_hop;
341
342 skb->protocol = htons(ETH_P_IPV6);
343 skb->priority = priority;
344 skb->mark = mark;
345
346 mtu = dst_mtu(dst);
347 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
348 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
349
350 /* if egress device is enslaved to an L3 master device pass the
351 * skb to its handler for processing
352 */
353 skb = l3mdev_ip6_out((struct sock *)sk, skb);
354 if (unlikely(!skb))
355 return 0;
356
357 /* hooks should never assume socket lock is held.
358 * we promote our socket to non const
359 */
360 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
361 net, (struct sock *)sk, skb, NULL, dev,
362 dst_output);
363 }
364
365 skb->dev = dev;
366 /* ipv6_local_error() does not require socket lock,
367 * we promote our socket to non const
368 */
369 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
370
371 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
372 kfree_skb(skb);
373 return -EMSGSIZE;
374 }
375 EXPORT_SYMBOL(ip6_xmit);
376
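/* Deliver a Router Alert packet to every raw socket that registered for
 * this alert value via IPV6_ROUTER_ALERT.  Returns 1 when the packet has
 * been consumed by a listener, 0 otherwise.
 */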
377 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
378 {
379 struct ip6_ra_chain *ra;
380 struct sock *last = NULL;
381
382 read_lock(&ip6_ra_lock);
383 for (ra = ip6_ra_chain; ra; ra = ra->next) {
384 struct sock *sk = ra->sk;
385 if (sk && ra->sel == sel &&
386 (!sk->sk_bound_dev_if ||
387 sk->sk_bound_dev_if == skb->dev->ifindex)) {
388 struct ipv6_pinfo *np = inet6_sk(sk);
389
390 if (np && np->rtalert_isolate &&
391 !net_eq(sock_net(sk), dev_net(skb->dev))) {
392 continue;
393 }
394 if (last) {
395 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
396 if (skb2)
397 rawv6_rcv(last, skb2);
398 }
399 last = sk;
400 }
401 }
402
403 if (last) {
404 rawv6_rcv(last, skb);
405 read_unlock(&ip6_ra_lock);
406 return 1;
407 }
408 read_unlock(&ip6_ra_lock);
409 return 0;
410 }
411
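/* Proxy NDP helper: returns 1 when the packet is a neighbour discovery
 * message that must be handled locally on behalf of the proxied address,
 * -1 when it must be dropped (link-local destination), and 0 when normal
 * forwarding should continue.
 */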
412 static int ip6_forward_proxy_check(struct sk_buff *skb)
413 {
414 struct ipv6hdr *hdr = ipv6_hdr(skb);
415 u8 nexthdr = hdr->nexthdr;
416 __be16 frag_off;
417 int offset;
418
419 if (ipv6_ext_hdr(nexthdr)) {
420 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
421 if (offset < 0)
422 return 0;
423 } else
424 offset = sizeof(struct ipv6hdr);
425
426 if (nexthdr == IPPROTO_ICMPV6) {
427 struct icmp6hdr *icmp6;
428
429 if (!pskb_may_pull(skb, (skb_network_header(skb) +
430 offset + 1 - skb->data)))
431 return 0;
432
433 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
434
435 switch (icmp6->icmp6_type) {
436 case NDISC_ROUTER_SOLICITATION:
437 case NDISC_ROUTER_ADVERTISEMENT:
438 case NDISC_NEIGHBOUR_SOLICITATION:
439 case NDISC_NEIGHBOUR_ADVERTISEMENT:
440 case NDISC_REDIRECT:
441 /* For a reaction involving a unicast neighbor discovery
442 * message destined to the proxied address, pass it to the
443 * input function.
444 */
445 return 1;
446 default:
447 break;
448 }
449 }
450
451 /*
452 * The proxying router can't forward traffic sent to a link-local
453 * address, so signal the sender and discard the packet. This
454 * behavior is clarified by the MIPv6 specification.
455 */
456 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
457 dst_link_failure(skb);
458 return -1;
459 }
460
461 return 0;
462 }
463
464 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
465 struct sk_buff *skb)
466 {
467 struct dst_entry *dst = skb_dst(skb);
468
469 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
470
471 #ifdef CONFIG_NET_SWITCHDEV
472 if (skb->offload_l3_fwd_mark) {
473 consume_skb(skb);
474 return 0;
475 }
476 #endif
477
478 skb_clear_tstamp(skb);
479 return dst_output(net, sk, skb);
480 }
481
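/* True when the packet exceeds @mtu and cannot be let through: the
 * conntrack defrag frag_max_size already exceeds the MTU, or neither
 * ignore_df nor GSO segmentation permits the oversized packet.
 */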
482 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
483 {
484 if (skb->len <= mtu)
485 return false;
486
487 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
488 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
489 return true;
490
491 if (skb->ignore_df)
492 return false;
493
494 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
495 return false;
496
497 return true;
498 }
499
500 int ip6_forward(struct sk_buff *skb)
501 {
502 struct dst_entry *dst = skb_dst(skb);
503 struct ipv6hdr *hdr = ipv6_hdr(skb);
504 struct inet6_skb_parm *opt = IP6CB(skb);
505 struct net *net = dev_net(dst->dev);
506 struct inet6_dev *idev;
507 SKB_DR(reason);
508 u32 mtu;
509
510 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
511 if (net->ipv6.devconf_all->forwarding == 0)
512 goto error;
513
514 if (skb->pkt_type != PACKET_HOST)
515 goto drop;
516
517 if (unlikely(skb->sk))
518 goto drop;
519
520 if (skb_warn_if_lro(skb))
521 goto drop;
522
523 if (!net->ipv6.devconf_all->disable_policy &&
524 (!idev || !idev->cnf.disable_policy) &&
525 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
526 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
527 goto drop;
528 }
529
530 skb_forward_csum(skb);
531
532 /*
533 * We DO NOT do any processing on
534 * RA packets, pushing them to user level AS IS
535 * without any guarantee that the application will be able
536 * to interpret them. The reason is that we
537 * cannot do anything clever here.
538 *
539 * We are not an end node, so if the packet contains
540 * AH/ESP, we cannot do anything with it.
541 * Defragmentation would also be a mistake; RA packets
542 * cannot be fragmented, because there is no guarantee
543 * that different fragments will go along one path. --ANK
544 */
545 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
546 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
547 return 0;
548 }
549
550 /*
551 * check and decrement ttl
552 */
553 if (hdr->hop_limit <= 1) {
554 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
555 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
556
557 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
558 return -ETIMEDOUT;
559 }
560
561 /* XXX: idev->cnf.proxy_ndp? */
562 if (net->ipv6.devconf_all->proxy_ndp &&
563 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
564 int proxied = ip6_forward_proxy_check(skb);
565 if (proxied > 0) {
566 /* It's tempting to decrease the hop limit
567 * here by 1, as we do at the end of the
568 * function too.
569 *
570 * But that would be incorrect, as proxying is
571 * not forwarding. The ip6_input function
572 * will handle this packet locally, and it
573 * depends on the hop limit being unchanged.
574 *
575 * One example is the NDP hop limit, that
576 * always has to stay 255, but other would be
577 * similar checks around RA packets, where the
578 * user can even change the desired limit.
579 */
580 return ip6_input(skb);
581 } else if (proxied < 0) {
582 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
583 goto drop;
584 }
585 }
586
587 if (!xfrm6_route_forward(skb)) {
588 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
589 SKB_DR_SET(reason, XFRM_POLICY);
590 goto drop;
591 }
592 dst = skb_dst(skb);
593
594 /* IPv6 specs say nothing about it, but it is clear that we cannot
595 send redirects to source routed frames.
596 We don't send redirects to frames decapsulated from IPsec.
597 */
598 if (IP6CB(skb)->iif == dst->dev->ifindex &&
599 opt->srcrt == 0 && !skb_sec_path(skb)) {
600 struct in6_addr *target = NULL;
601 struct inet_peer *peer;
602 struct rt6_info *rt;
603
604 /*
605 * incoming and outgoing devices are the same
606 * send a redirect.
607 */
608
609 rt = dst_rt6_info(dst);
610 if (rt->rt6i_flags & RTF_GATEWAY)
611 target = &rt->rt6i_gateway;
612 else
613 target = &hdr->daddr;
614
615 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
616
617 /* Limit redirects both by destination (here)
618 and by source (inside ndisc_send_redirect)
619 */
620 if (inet_peer_xrlim_allow(peer, 1*HZ))
621 ndisc_send_redirect(skb, target);
622 if (peer)
623 inet_putpeer(peer);
624 } else {
625 int addrtype = ipv6_addr_type(&hdr->saddr);
626
627 /* This check is security critical. */
628 if (addrtype == IPV6_ADDR_ANY ||
629 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
630 goto error;
631 if (addrtype & IPV6_ADDR_LINKLOCAL) {
632 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
633 ICMPV6_NOT_NEIGHBOUR, 0);
634 goto error;
635 }
636 }
637
638 mtu = ip6_dst_mtu_maybe_forward(dst, true);
639 if (mtu < IPV6_MIN_MTU)
640 mtu = IPV6_MIN_MTU;
641
642 if (ip6_pkt_too_big(skb, mtu)) {
643 /* Again, force OUTPUT device used as source address */
644 skb->dev = dst->dev;
645 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
646 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
647 __IP6_INC_STATS(net, ip6_dst_idev(dst),
648 IPSTATS_MIB_FRAGFAILS);
649 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
650 return -EMSGSIZE;
651 }
652
653 if (skb_cow(skb, dst->dev->hard_header_len)) {
654 __IP6_INC_STATS(net, ip6_dst_idev(dst),
655 IPSTATS_MIB_OUTDISCARDS);
656 goto drop;
657 }
658
659 hdr = ipv6_hdr(skb);
660
661 /* Mangling hops number delayed to point after skb COW */
662
663 hdr->hop_limit--;
664
665 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
666 net, NULL, skb, skb->dev, dst->dev,
667 ip6_forward_finish);
668
669 error:
670 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
671 SKB_DR_SET(reason, IP_INADDRERRORS);
672 drop:
673 kfree_skb_reason(skb, reason);
674 return -EINVAL;
675 }
676
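/* Propagate per-packet metadata (packet type, priority, protocol, dst,
 * device, mark, hash, tc index, netfilter/security state and extensions)
 * from the original skb to a freshly built fragment.
 */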
677 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
678 {
679 to->pkt_type = from->pkt_type;
680 to->priority = from->priority;
681 to->protocol = from->protocol;
682 skb_dst_drop(to);
683 skb_dst_set(to, dst_clone(skb_dst(from)));
684 to->dev = from->dev;
685 to->mark = from->mark;
686
687 skb_copy_hash(to, from);
688
689 #ifdef CONFIG_NET_SCHED
690 to->tc_index = from->tc_index;
691 #endif
692 nf_copy(to, from);
693 skb_ext_copy(to, from);
694 skb_copy_secmark(to, from);
695 }
696
697 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
698 u8 nexthdr, __be32 frag_id,
699 struct ip6_fraglist_iter *iter)
700 {
701 unsigned int first_len;
702 struct frag_hdr *fh;
703
704 /* BUILD HEADER */
705 *prevhdr = NEXTHDR_FRAGMENT;
706 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
707 if (!iter->tmp_hdr)
708 return -ENOMEM;
709
710 iter->frag = skb_shinfo(skb)->frag_list;
711 skb_frag_list_init(skb);
712
713 iter->offset = 0;
714 iter->hlen = hlen;
715 iter->frag_id = frag_id;
716 iter->nexthdr = nexthdr;
717
718 __skb_pull(skb, hlen);
719 fh = __skb_push(skb, sizeof(struct frag_hdr));
720 __skb_push(skb, hlen);
721 skb_reset_network_header(skb);
722 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
723
724 fh->nexthdr = nexthdr;
725 fh->reserved = 0;
726 fh->frag_off = htons(IP6_MF);
727 fh->identification = frag_id;
728
729 first_len = skb_pagelen(skb);
730 skb->data_len = first_len - skb_headlen(skb);
731 skb->len = first_len;
732 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
733
734 return 0;
735 }
736 EXPORT_SYMBOL(ip6_fraglist_init);
737
738 void ip6_fraglist_prepare(struct sk_buff *skb,
739 struct ip6_fraglist_iter *iter)
740 {
741 struct sk_buff *frag = iter->frag;
742 unsigned int hlen = iter->hlen;
743 struct frag_hdr *fh;
744
745 frag->ip_summed = CHECKSUM_NONE;
746 skb_reset_transport_header(frag);
747 fh = __skb_push(frag, sizeof(struct frag_hdr));
748 __skb_push(frag, hlen);
749 skb_reset_network_header(frag);
750 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
751 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
752 fh->nexthdr = iter->nexthdr;
753 fh->reserved = 0;
754 fh->frag_off = htons(iter->offset);
755 if (frag->next)
756 fh->frag_off |= htons(IP6_MF);
757 fh->identification = iter->frag_id;
758 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
759 ip6_copy_metadata(frag, skb);
760 }
761 EXPORT_SYMBOL(ip6_fraglist_prepare);
762
763 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
764 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
765 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
766 {
767 state->prevhdr = prevhdr;
768 state->nexthdr = nexthdr;
769 state->frag_id = frag_id;
770
771 state->hlen = hlen;
772 state->mtu = mtu;
773
774 state->left = skb->len - hlen; /* Space per frame */
775 state->ptr = hlen; /* Where to start from */
776
777 state->hroom = hdr_room;
778 state->troom = needed_tailroom;
779
780 state->offset = 0;
781 }
782 EXPORT_SYMBOL(ip6_frag_init);
783
784 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
785 {
786 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
787 struct sk_buff *frag;
788 struct frag_hdr *fh;
789 unsigned int len;
790
791 len = state->left;
792 /* IF: it doesn't fit, use 'mtu' - the data space left */
793 if (len > state->mtu)
794 len = state->mtu;
795 /* IF: we are not sending up to and including the packet end
796 then align the next start on an eight byte boundary */
797 if (len < state->left)
798 len &= ~7;
799
800 /* Allocate buffer */
801 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
802 state->hroom + state->troom, GFP_ATOMIC);
803 if (!frag)
804 return ERR_PTR(-ENOMEM);
805
806 /*
807 * Set up data on packet
808 */
809
810 ip6_copy_metadata(frag, skb);
811 skb_reserve(frag, state->hroom);
812 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
813 skb_reset_network_header(frag);
814 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
815 frag->transport_header = (frag->network_header + state->hlen +
816 sizeof(struct frag_hdr));
817
818 /*
819 * Charge the memory for the fragment to any owner
820 * it might possess
821 */
822 if (skb->sk)
823 skb_set_owner_w(frag, skb->sk);
824
825 /*
826 * Copy the packet header into the new buffer.
827 */
828 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
829
830 fragnexthdr_offset = skb_network_header(frag);
831 fragnexthdr_offset += prevhdr - skb_network_header(skb);
832 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
833
834 /*
835 * Build fragment header.
836 */
837 fh->nexthdr = state->nexthdr;
838 fh->reserved = 0;
839 fh->identification = state->frag_id;
840
841 /*
842 * Copy a block of the IP datagram.
843 */
844 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
845 len));
846 state->left -= len;
847
848 fh->frag_off = htons(state->offset);
849 if (state->left > 0)
850 fh->frag_off |= htons(IP6_MF);
851 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
852
853 state->ptr += len;
854 state->offset += len;
855
856 return frag;
857 }
858 EXPORT_SYMBOL(ip6_frag_next);
859
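/* Fragment an IPv6 packet and transmit each fragment via @output.  The
 * fast path reuses an existing frag_list when its geometry already matches
 * the fragment size; otherwise the slow path copies the payload into newly
 * allocated fragment skbs built by ip6_frag_next().
 */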
860 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
861 int (*output)(struct net *, struct sock *, struct sk_buff *))
862 {
863 struct sk_buff *frag;
864 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
865 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
866 inet6_sk(skb->sk) : NULL;
867 bool mono_delivery_time = skb->mono_delivery_time;
868 struct ip6_frag_state state;
869 unsigned int mtu, hlen, nexthdr_offset;
870 ktime_t tstamp = skb->tstamp;
871 int hroom, err = 0;
872 __be32 frag_id;
873 u8 *prevhdr, nexthdr = 0;
874
875 err = ip6_find_1stfragopt(skb, &prevhdr);
876 if (err < 0)
877 goto fail;
878 hlen = err;
879 nexthdr = *prevhdr;
880 nexthdr_offset = prevhdr - skb_network_header(skb);
881
882 mtu = ip6_skb_dst_mtu(skb);
883
884 /* We must not fragment if the socket is set to force MTU discovery
885 * or if the skb is not generated by a local socket.
886 */
887 if (unlikely(!skb->ignore_df && skb->len > mtu))
888 goto fail_toobig;
889
890 if (IP6CB(skb)->frag_max_size) {
891 if (IP6CB(skb)->frag_max_size > mtu)
892 goto fail_toobig;
893
894 /* don't send fragments larger than what we received */
895 mtu = IP6CB(skb)->frag_max_size;
896 if (mtu < IPV6_MIN_MTU)
897 mtu = IPV6_MIN_MTU;
898 }
899
900 if (np && np->frag_size < mtu) {
901 if (np->frag_size)
902 mtu = np->frag_size;
903 }
904 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
905 goto fail_toobig;
906 mtu -= hlen + sizeof(struct frag_hdr);
907
908 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
909 &ipv6_hdr(skb)->saddr);
910
911 if (skb->ip_summed == CHECKSUM_PARTIAL &&
912 (err = skb_checksum_help(skb)))
913 goto fail;
914
915 prevhdr = skb_network_header(skb) + nexthdr_offset;
916 hroom = LL_RESERVED_SPACE(rt->dst.dev);
917 if (skb_has_frag_list(skb)) {
918 unsigned int first_len = skb_pagelen(skb);
919 struct ip6_fraglist_iter iter;
920 struct sk_buff *frag2;
921
922 if (first_len - hlen > mtu ||
923 ((first_len - hlen) & 7) ||
924 skb_cloned(skb) ||
925 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
926 goto slow_path;
927
928 skb_walk_frags(skb, frag) {
929 /* Correct geometry. */
930 if (frag->len > mtu ||
931 ((frag->len & 7) && frag->next) ||
932 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
933 goto slow_path_clean;
934
935 /* Partially cloned skb? */
936 if (skb_shared(frag))
937 goto slow_path_clean;
938
939 BUG_ON(frag->sk);
940 if (skb->sk) {
941 frag->sk = skb->sk;
942 frag->destructor = sock_wfree;
943 }
944 skb->truesize -= frag->truesize;
945 }
946
947 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
948 &iter);
949 if (err < 0)
950 goto fail;
951
952 /* We prevent @rt from being freed. */
953 rcu_read_lock();
954
955 for (;;) {
956 /* Prepare the header of the next frame,
957 * before the previous one goes down. */
958 if (iter.frag)
959 ip6_fraglist_prepare(skb, &iter);
960
961 skb_set_delivery_time(skb, tstamp, mono_delivery_time);
962 err = output(net, sk, skb);
963 if (!err)
964 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
965 IPSTATS_MIB_FRAGCREATES);
966
967 if (err || !iter.frag)
968 break;
969
970 skb = ip6_fraglist_next(&iter);
971 }
972
973 kfree(iter.tmp_hdr);
974
975 if (err == 0) {
976 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
977 IPSTATS_MIB_FRAGOKS);
978 rcu_read_unlock();
979 return 0;
980 }
981
982 kfree_skb_list(iter.frag);
983
984 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
985 IPSTATS_MIB_FRAGFAILS);
986 rcu_read_unlock();
987 return err;
988
989 slow_path_clean:
990 skb_walk_frags(skb, frag2) {
991 if (frag2 == frag)
992 break;
993 frag2->sk = NULL;
994 frag2->destructor = NULL;
995 skb->truesize += frag2->truesize;
996 }
997 }
998
999 slow_path:
1000 /*
1001 * Fragment the datagram.
1002 */
1003
1004 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1005 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1006 &state);
1007
1008 /*
1009 * Keep copying data until we run out.
1010 */
1011
1012 while (state.left > 0) {
1013 frag = ip6_frag_next(skb, &state);
1014 if (IS_ERR(frag)) {
1015 err = PTR_ERR(frag);
1016 goto fail;
1017 }
1018
1019 /*
1020 * Put this fragment into the sending queue.
1021 */
1022 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1023 err = output(net, sk, frag);
1024 if (err)
1025 goto fail;
1026
1027 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 IPSTATS_MIB_FRAGCREATES);
1029 }
1030 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1031 IPSTATS_MIB_FRAGOKS);
1032 consume_skb(skb);
1033 return err;
1034
1035 fail_toobig:
1036 if (skb->sk && dst_allfrag(skb_dst(skb)))
1037 sk_gso_disable(skb->sk);
1038
1039 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1040 err = -EMSGSIZE;
1041
1042 fail:
1043 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1044 IPSTATS_MIB_FRAGFAILS);
1045 kfree_skb(skb);
1046 return err;
1047 }
1048
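/* Non-zero when the cached route can no longer be trusted for @fl_addr:
 * it is not a host (/128) route to that address and the saved @addr_cache
 * does not match either.
 */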
1049 static inline int ip6_rt_check(const struct rt6key *rt_key,
1050 const struct in6_addr *fl_addr,
1051 const struct in6_addr *addr_cache)
1052 {
1053 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1054 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1055 }
1056
1057 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1058 struct dst_entry *dst,
1059 const struct flowi6 *fl6)
1060 {
1061 struct ipv6_pinfo *np = inet6_sk(sk);
1062 struct rt6_info *rt;
1063
1064 if (!dst)
1065 goto out;
1066
1067 if (dst->ops->family != AF_INET6) {
1068 dst_release(dst);
1069 return NULL;
1070 }
1071
1072 rt = dst_rt6_info(dst);
1073 /* Yes, checking route validity in the not-connected
1074 * case is not very simple. Take into account
1075 * that we do not support routing by source, TOS,
1076 * and MSG_DONTROUTE --ANK (980726)
1077 *
1078 * 1. ip6_rt_check(): If route was host route,
1079 * check that cached destination is current.
1080 * If it is network route, we still may
1081 * check its validity using saved pointer
1082 * to the last used address: daddr_cache.
1083 * We do not want to save whole address now,
1084 * (because the main consumer of this service
1085 * is TCP, which does not have this problem),
1086 * so that the last trick works only on connected
1087 * sockets.
1088 * 2. oif also should be the same.
1089 */
1090 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1091 #ifdef CONFIG_IPV6_SUBTREES
1092 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1093 #endif
1094 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1095 dst_release(dst);
1096 dst = NULL;
1097 }
1098
1099 out:
1100 return dst;
1101 }
1102
1103 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1104 struct dst_entry **dst, struct flowi6 *fl6)
1105 {
1106 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1107 struct neighbour *n;
1108 struct rt6_info *rt;
1109 #endif
1110 int err;
1111 int flags = 0;
1112
1113 /* The correct way to handle this would be to do
1114 * ip6_route_get_saddr, and then ip6_route_output; however,
1115 * the route-specific preferred source forces the
1116 * ip6_route_output call _before_ ip6_route_get_saddr.
1117 *
1118 * In source specific routing (no src=any default route),
1119 * ip6_route_output will fail given src=any saddr, though, so
1120 * that's why we try it again later.
1121 */
1122 if (ipv6_addr_any(&fl6->saddr)) {
1123 struct fib6_info *from;
1124 struct rt6_info *rt;
1125
1126 *dst = ip6_route_output(net, sk, fl6);
1127 rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1128
1129 rcu_read_lock();
1130 from = rt ? rcu_dereference(rt->from) : NULL;
1131 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1132 sk ? inet6_sk(sk)->srcprefs : 0,
1133 fl6->flowi6_l3mdev,
1134 &fl6->saddr);
1135 rcu_read_unlock();
1136
1137 if (err)
1138 goto out_err_release;
1139
1140 /* If we had an erroneous initial result, pretend it
1141 * never existed and let the SA-enabled version take
1142 * over.
1143 */
1144 if ((*dst)->error) {
1145 dst_release(*dst);
1146 *dst = NULL;
1147 }
1148
1149 if (fl6->flowi6_oif)
1150 flags |= RT6_LOOKUP_F_IFACE;
1151 }
1152
1153 if (!*dst)
1154 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1155
1156 err = (*dst)->error;
1157 if (err)
1158 goto out_err_release;
1159
1160 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1161 /*
1162 * Here if the dst entry we've looked up
1163 * has a neighbour entry that is in the INCOMPLETE
1164 * state and the src address from the flow is
1165 * marked as OPTIMISTIC, we release the found
1166 * dst entry and replace it with the
1167 * dst entry of the nexthop router.
1168 */
1169 rt = dst_rt6_info(*dst);
1170 rcu_read_lock();
1171 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1172 rt6_nexthop(rt, &fl6->daddr));
1173 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1174 rcu_read_unlock();
1175
1176 if (err) {
1177 struct inet6_ifaddr *ifp;
1178 struct flowi6 fl_gw6;
1179 int redirect;
1180
1181 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1182 (*dst)->dev, 1);
1183
1184 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1185 if (ifp)
1186 in6_ifa_put(ifp);
1187
1188 if (redirect) {
1189 /*
1190 * We need to get the dst entry for the
1191 * default router instead
1192 */
1193 dst_release(*dst);
1194 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1195 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1196 *dst = ip6_route_output(net, sk, &fl_gw6);
1197 err = (*dst)->error;
1198 if (err)
1199 goto out_err_release;
1200 }
1201 }
1202 #endif
1203 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1204 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1205 err = -EAFNOSUPPORT;
1206 goto out_err_release;
1207 }
1208
1209 return 0;
1210
1211 out_err_release:
1212 dst_release(*dst);
1213 *dst = NULL;
1214
1215 if (err == -ENETUNREACH)
1216 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1217 return err;
1218 }
1219
1220 /**
1221 * ip6_dst_lookup - perform route lookup on flow
1222 * @net: Network namespace to perform lookup in
1223 * @sk: socket which provides route info
1224 * @dst: pointer to dst_entry * for result
1225 * @fl6: flow to lookup
1226 *
1227 * This function performs a route lookup on the given flow.
1228 *
1229 * It returns zero on success, or a standard errno code on error.
1230 */
1231 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1232 struct flowi6 *fl6)
1233 {
1234 *dst = NULL;
1235 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1236 }
1237 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1238
1239 /**
1240 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1241 * @net: Network namespace to perform lookup in
1242 * @sk: socket which provides route info
1243 * @fl6: flow to lookup
1244 * @final_dst: final destination address for ipsec lookup
1245 *
1246 * This function performs a route lookup on the given flow.
1247 *
1248 * It returns a valid dst pointer on success, or a pointer encoded
1249 * error code.
1250 */
1251 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1252 const struct in6_addr *final_dst)
1253 {
1254 struct dst_entry *dst = NULL;
1255 int err;
1256
1257 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1258 if (err)
1259 return ERR_PTR(err);
1260 if (final_dst)
1261 fl6->daddr = *final_dst;
1262
1263 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1264 }
1265 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1266
1267 /**
1268 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1269 * @sk: socket which provides the dst cache and route info
1270 * @fl6: flow to lookup
1271 * @final_dst: final destination address for ipsec lookup
1272 * @connected: whether @sk is connected or not
1273 *
1274 * This function performs a route lookup on the given flow with the
1275 * possibility of using the cached route in the socket if it is valid.
1276 * It will take the socket dst lock when operating on the dst cache.
1277 * As a result, this function can only be used in process context.
1278 *
1279 * In addition, for a connected socket, cache the dst in the socket
1280 * if the current cache is not valid.
1281 *
1282 * It returns a valid dst pointer on success, or a pointer encoded
1283 * error code.
1284 */
1285 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1286 const struct in6_addr *final_dst,
1287 bool connected)
1288 {
1289 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1290
1291 dst = ip6_sk_dst_check(sk, dst, fl6);
1292 if (dst)
1293 return dst;
1294
1295 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1296 if (connected && !IS_ERR(dst))
1297 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1298
1299 return dst;
1300 }
1301 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1302
1303 /**
1304 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1305 * @skb: Packet for which lookup is done
1306 * @dev: Tunnel device
1307 * @net: Network namespace of tunnel device
1308 * @sock: Socket which provides route info
1309 * @saddr: Memory to store the src ip address
1310 * @info: Tunnel information
1311 * @protocol: IP protocol
1312 * @use_cache: Flag to enable cache usage
1313 * This function performs a route lookup on a tunnel
1314 *
1315 * It returns a valid dst pointer and stores src address to be used in
1316 * tunnel in param saddr on success, else a pointer encoded error code.
1317 */
1318
1319 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1320 struct net_device *dev,
1321 struct net *net,
1322 struct socket *sock,
1323 struct in6_addr *saddr,
1324 const struct ip_tunnel_info *info,
1325 u8 protocol,
1326 bool use_cache)
1327 {
1328 struct dst_entry *dst = NULL;
1329 #ifdef CONFIG_DST_CACHE
1330 struct dst_cache *dst_cache;
1331 #endif
1332 struct flowi6 fl6;
1333 __u8 prio;
1334
1335 #ifdef CONFIG_DST_CACHE
1336 dst_cache = (struct dst_cache *)&info->dst_cache;
1337 if (use_cache) {
1338 dst = dst_cache_get_ip6(dst_cache, saddr);
1339 if (dst)
1340 return dst;
1341 }
1342 #endif
1343 memset(&fl6, 0, sizeof(fl6));
1344 fl6.flowi6_mark = skb->mark;
1345 fl6.flowi6_proto = protocol;
1346 fl6.daddr = info->key.u.ipv6.dst;
1347 fl6.saddr = info->key.u.ipv6.src;
1348 prio = info->key.tos;
1349 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1350
1351 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1352 NULL);
1353 if (IS_ERR(dst)) {
1354 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1355 return ERR_PTR(-ENETUNREACH);
1356 }
1357 if (dst->dev == dev) { /* is this necessary? */
1358 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1359 dst_release(dst);
1360 return ERR_PTR(-ELOOP);
1361 }
1362 #ifdef CONFIG_DST_CACHE
1363 if (use_cache)
1364 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1365 #endif
1366 *saddr = fl6.saddr;
1367 return dst;
1368 }
1369 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1370
1371 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1372 gfp_t gfp)
1373 {
1374 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1375 }
1376
1377 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1378 gfp_t gfp)
1379 {
1380 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1381 }
1382
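/* Recompute the usable mtu and maxfraglen when building the next fragment
 * outside of an XFRM tunnel: only the first fragment has to reserve the
 * dst header_len; later ones may use that space for data.
 */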
1383 static void ip6_append_data_mtu(unsigned int *mtu,
1384 int *maxfraglen,
1385 unsigned int fragheaderlen,
1386 struct sk_buff *skb,
1387 struct rt6_info *rt,
1388 unsigned int orig_mtu)
1389 {
1390 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1391 if (!skb) {
1392 /* first fragment, reserve header_len */
1393 *mtu = orig_mtu - rt->dst.header_len;
1394
1395 } else {
1396 /*
1397 * this fragment is not the first; the header
1398 * space is regarded as data space.
1399 */
1400 *mtu = orig_mtu;
1401 }
1402 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1403 + fragheaderlen - sizeof(struct frag_hdr);
1404 }
1405 }
1406
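/* Initialise the cork for a new ip6_append_data() run: take over the dst
 * reference, duplicate the tx options, and record hop limit, traffic class,
 * fragment size (path MTU or the socket's frag_size), GSO size, mark and
 * timestamping flags.
 */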
1407 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1408 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1409 struct rt6_info *rt)
1410 {
1411 struct ipv6_pinfo *np = inet6_sk(sk);
1412 unsigned int mtu;
1413 struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1414
1415 /* callers pass dst together with a reference, set it first so
1416 * ip6_cork_release() can put it down even in case of an error.
1417 */
1418 cork->base.dst = &rt->dst;
1419
1420 /*
1421 * setup for corking
1422 */
1423 if (opt) {
1424 if (WARN_ON(v6_cork->opt))
1425 return -EINVAL;
1426
1427 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1428 if (unlikely(!nopt))
1429 return -ENOBUFS;
1430
1431 nopt->tot_len = sizeof(*opt);
1432 nopt->opt_flen = opt->opt_flen;
1433 nopt->opt_nflen = opt->opt_nflen;
1434
1435 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1436 if (opt->dst0opt && !nopt->dst0opt)
1437 return -ENOBUFS;
1438
1439 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1440 if (opt->dst1opt && !nopt->dst1opt)
1441 return -ENOBUFS;
1442
1443 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1444 if (opt->hopopt && !nopt->hopopt)
1445 return -ENOBUFS;
1446
1447 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1448 if (opt->srcrt && !nopt->srcrt)
1449 return -ENOBUFS;
1450
1451 /* need source address above miyazawa*/
1452 }
1453 v6_cork->hop_limit = ipc6->hlimit;
1454 v6_cork->tclass = ipc6->tclass;
1455 if (rt->dst.flags & DST_XFRM_TUNNEL)
1456 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1457 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1458 else
1459 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1460 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1461 if (np->frag_size < mtu) {
1462 if (np->frag_size)
1463 mtu = np->frag_size;
1464 }
1465 cork->base.fragsize = mtu;
1466 cork->base.gso_size = ipc6->gso_size;
1467 cork->base.tx_flags = 0;
1468 cork->base.mark = ipc6->sockc.mark;
1469 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1470
1471 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1472 cork->base.flags |= IPCORK_ALLFRAG;
1473 cork->base.length = 0;
1474
1475 cork->base.transmit_time = ipc6->sockc.transmit_time;
1476
1477 return 0;
1478 }
1479
1480 static int __ip6_append_data(struct sock *sk,
1481 struct sk_buff_head *queue,
1482 struct inet_cork_full *cork_full,
1483 struct inet6_cork *v6_cork,
1484 struct page_frag *pfrag,
1485 int getfrag(void *from, char *to, int offset,
1486 int len, int odd, struct sk_buff *skb),
1487 void *from, size_t length, int transhdrlen,
1488 unsigned int flags, struct ipcm6_cookie *ipc6)
1489 {
1490 struct sk_buff *skb, *skb_prev = NULL;
1491 struct inet_cork *cork = &cork_full->base;
1492 struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1493 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1494 struct ubuf_info *uarg = NULL;
1495 int exthdrlen = 0;
1496 int dst_exthdrlen = 0;
1497 int hh_len;
1498 int copy;
1499 int err;
1500 int offset = 0;
1501 bool zc = false;
1502 u32 tskey = 0;
1503 struct rt6_info *rt = dst_rt6_info(cork->dst);
1504 bool paged, hold_tskey, extra_uref = false;
1505 struct ipv6_txoptions *opt = v6_cork->opt;
1506 int csummode = CHECKSUM_NONE;
1507 unsigned int maxnonfragsize, headersize;
1508 unsigned int wmem_alloc_delta = 0;
1509
1510 skb = skb_peek_tail(queue);
1511 if (!skb) {
1512 exthdrlen = opt ? opt->opt_flen : 0;
1513 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1514 }
1515
1516 paged = !!cork->gso_size;
1517 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1518 orig_mtu = mtu;
1519
1520 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1521
1522 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1523 (opt ? opt->opt_nflen : 0);
1524
1525 headersize = sizeof(struct ipv6hdr) +
1526 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1527 (dst_allfrag(&rt->dst) ?
1528 sizeof(struct frag_hdr) : 0) +
1529 rt->rt6i_nfheader_len;
1530
1531 if (mtu <= fragheaderlen ||
1532 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1533 goto emsgsize;
1534
1535 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1536 sizeof(struct frag_hdr);
1537
1538 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1539 * the first fragment
1540 */
1541 if (headersize + transhdrlen > mtu)
1542 goto emsgsize;
1543
1544 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1545 (sk->sk_protocol == IPPROTO_UDP ||
1546 sk->sk_protocol == IPPROTO_ICMPV6 ||
1547 sk->sk_protocol == IPPROTO_RAW)) {
1548 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1549 sizeof(struct ipv6hdr));
1550 goto emsgsize;
1551 }
1552
1553 if (ip6_sk_ignore_df(sk))
1554 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1555 else
1556 maxnonfragsize = mtu;
1557
1558 if (cork->length + length > maxnonfragsize - headersize) {
1559 emsgsize:
1560 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1561 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1562 return -EMSGSIZE;
1563 }
1564
1565 /* CHECKSUM_PARTIAL only with no extension headers and when
1566 * we are not going to fragment
1567 */
1568 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1569 headersize == sizeof(struct ipv6hdr) &&
1570 length <= mtu - headersize &&
1571 (!(flags & MSG_MORE) || cork->gso_size) &&
1572 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1573 csummode = CHECKSUM_PARTIAL;
1574
1575 if ((flags & MSG_ZEROCOPY) && length) {
1576 struct msghdr *msg = from;
1577
1578 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1579 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1580 return -EINVAL;
1581
1582 /* Leave uarg NULL if we can't zerocopy; callers should
1583 * be able to handle it.
1584 */
1585 if ((rt->dst.dev->features & NETIF_F_SG) &&
1586 csummode == CHECKSUM_PARTIAL) {
1587 paged = true;
1588 zc = true;
1589 uarg = msg->msg_ubuf;
1590 }
1591 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1592 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1593 if (!uarg)
1594 return -ENOBUFS;
1595 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1596 if (rt->dst.dev->features & NETIF_F_SG &&
1597 csummode == CHECKSUM_PARTIAL) {
1598 paged = true;
1599 zc = true;
1600 } else {
1601 uarg_to_msgzc(uarg)->zerocopy = 0;
1602 skb_zcopy_set(skb, uarg, &extra_uref);
1603 }
1604 }
1605 } else if ((flags & MSG_SPLICE_PAGES) && length) {
1606 if (inet_test_bit(HDRINCL, sk))
1607 return -EPERM;
1608 if (rt->dst.dev->features & NETIF_F_SG &&
1609 getfrag == ip_generic_getfrag)
1610 /* We need an empty buffer to attach stuff to */
1611 paged = true;
1612 else
1613 flags &= ~MSG_SPLICE_PAGES;
1614 }
1615
1616 hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
1617 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
1618 if (hold_tskey)
1619 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1620
1621 /*
1622 * Let's try using as much space as possible.
1623 * Use MTU if total length of the message fits into the MTU.
1624 * Otherwise, we need to reserve fragment header and
1625 * fragment alignment (= 8-15 octets, in total).
1626 *
1627 * Note that we may need to "move" the data from the tail
1628 * of the buffer to the new fragment when we split
1629 * the message.
1630 *
1631 * FIXME: It may be fragmented into multiple chunks
1632 * at once if non-fragmentable extension headers
1633 * are too large.
1634 * --yoshfuji
1635 */
1636
1637 cork->length += length;
1638 if (!skb)
1639 goto alloc_new_skb;
1640
1641 while (length > 0) {
1642 /* Check if the remaining data fits into current packet. */
1643 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1644 if (copy < length)
1645 copy = maxfraglen - skb->len;
1646
1647 if (copy <= 0) {
1648 char *data;
1649 unsigned int datalen;
1650 unsigned int fraglen;
1651 unsigned int fraggap;
1652 unsigned int alloclen, alloc_extra;
1653 unsigned int pagedlen;
1654 alloc_new_skb:
1655 /* There's no room in the current skb */
1656 if (skb)
1657 fraggap = skb->len - maxfraglen;
1658 else
1659 fraggap = 0;
1660 /* update mtu and maxfraglen if necessary */
1661 if (!skb || !skb_prev)
1662 ip6_append_data_mtu(&mtu, &maxfraglen,
1663 fragheaderlen, skb, rt,
1664 orig_mtu);
1665
1666 skb_prev = skb;
1667
1668 /*
1669 * If remaining data exceeds the mtu,
1670 * we know we need more fragment(s).
1671 */
1672 datalen = length + fraggap;
1673
1674 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1675 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1676 fraglen = datalen + fragheaderlen;
1677 pagedlen = 0;
1678
1679 alloc_extra = hh_len;
1680 alloc_extra += dst_exthdrlen;
1681 alloc_extra += rt->dst.trailer_len;
1682
1683 /* We just reserve space for fragment header.
1684 * Note: this may be overallocation if the message
1685 * (without MSG_MORE) fits into the MTU.
1686 */
1687 alloc_extra += sizeof(struct frag_hdr);
1688
1689 if ((flags & MSG_MORE) &&
1690 !(rt->dst.dev->features&NETIF_F_SG))
1691 alloclen = mtu;
1692 else if (!paged &&
1693 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1694 !(rt->dst.dev->features & NETIF_F_SG)))
1695 alloclen = fraglen;
1696 else {
1697 alloclen = fragheaderlen + transhdrlen;
1698 pagedlen = datalen - transhdrlen;
1699 }
1700 alloclen += alloc_extra;
1701
1702 if (datalen != length + fraggap) {
1703 /*
1704 * this is not the last fragment, the trailer
1705 * space is regarded as data space.
1706 */
1707 datalen += rt->dst.trailer_len;
1708 }
1709
1710 fraglen = datalen + fragheaderlen;
1711
1712 copy = datalen - transhdrlen - fraggap - pagedlen;
1713 /* [!] NOTE: copy may be negative if pagedlen>0
1714 * because then the equation may reduce to -fraggap.
1715 */
1716 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1717 err = -EINVAL;
1718 goto error;
1719 }
1720 if (transhdrlen) {
1721 skb = sock_alloc_send_skb(sk, alloclen,
1722 (flags & MSG_DONTWAIT), &err);
1723 } else {
1724 skb = NULL;
1725 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1726 2 * sk->sk_sndbuf)
1727 skb = alloc_skb(alloclen,
1728 sk->sk_allocation);
1729 if (unlikely(!skb))
1730 err = -ENOBUFS;
1731 }
1732 if (!skb)
1733 goto error;
1734 /*
1735 * Fill in the control structures
1736 */
1737 skb->protocol = htons(ETH_P_IPV6);
1738 skb->ip_summed = csummode;
1739 skb->csum = 0;
1740 /* reserve for fragmentation and ipsec header */
1741 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1742 dst_exthdrlen);
1743
1744 /*
1745 * Find where to start putting bytes
1746 */
1747 data = skb_put(skb, fraglen - pagedlen);
1748 skb_set_network_header(skb, exthdrlen);
1749 data += fragheaderlen;
1750 skb->transport_header = (skb->network_header +
1751 fragheaderlen);
1752 if (fraggap) {
1753 skb->csum = skb_copy_and_csum_bits(
1754 skb_prev, maxfraglen,
1755 data + transhdrlen, fraggap);
1756 skb_prev->csum = csum_sub(skb_prev->csum,
1757 skb->csum);
1758 data += fraggap;
1759 pskb_trim_unique(skb_prev, maxfraglen);
1760 }
1761 if (copy > 0 &&
1762 getfrag(from, data + transhdrlen, offset,
1763 copy, fraggap, skb) < 0) {
1764 err = -EFAULT;
1765 kfree_skb(skb);
1766 goto error;
1767 } else if (flags & MSG_SPLICE_PAGES) {
1768 copy = 0;
1769 }
1770
1771 offset += copy;
1772 length -= copy + transhdrlen;
1773 transhdrlen = 0;
1774 exthdrlen = 0;
1775 dst_exthdrlen = 0;
1776
1777 /* Only the initial fragment is time stamped */
1778 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1779 cork->tx_flags = 0;
1780 skb_shinfo(skb)->tskey = tskey;
1781 tskey = 0;
1782 skb_zcopy_set(skb, uarg, &extra_uref);
1783
1784 if ((flags & MSG_CONFIRM) && !skb_prev)
1785 skb_set_dst_pending_confirm(skb, 1);
1786
1787 /*
1788 * Put the packet on the pending queue
1789 */
1790 if (!skb->destructor) {
1791 skb->destructor = sock_wfree;
1792 skb->sk = sk;
1793 wmem_alloc_delta += skb->truesize;
1794 }
1795 __skb_queue_tail(queue, skb);
1796 continue;
1797 }
1798
1799 if (copy > length)
1800 copy = length;
1801
1802 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1803 skb_tailroom(skb) >= copy) {
1804 unsigned int off;
1805
1806 off = skb->len;
1807 if (getfrag(from, skb_put(skb, copy),
1808 offset, copy, off, skb) < 0) {
1809 __skb_trim(skb, off);
1810 err = -EFAULT;
1811 goto error;
1812 }
1813 } else if (flags & MSG_SPLICE_PAGES) {
1814 struct msghdr *msg = from;
1815
1816 err = -EIO;
1817 if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1818 goto error;
1819
1820 err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1821 sk->sk_allocation);
1822 if (err < 0)
1823 goto error;
1824 copy = err;
1825 wmem_alloc_delta += copy;
1826 } else if (!zc) {
1827 int i = skb_shinfo(skb)->nr_frags;
1828
1829 err = -ENOMEM;
1830 if (!sk_page_frag_refill(sk, pfrag))
1831 goto error;
1832
1833 skb_zcopy_downgrade_managed(skb);
1834 if (!skb_can_coalesce(skb, i, pfrag->page,
1835 pfrag->offset)) {
1836 err = -EMSGSIZE;
1837 if (i == MAX_SKB_FRAGS)
1838 goto error;
1839
1840 __skb_fill_page_desc(skb, i, pfrag->page,
1841 pfrag->offset, 0);
1842 skb_shinfo(skb)->nr_frags = ++i;
1843 get_page(pfrag->page);
1844 }
1845 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1846 if (getfrag(from,
1847 page_address(pfrag->page) + pfrag->offset,
1848 offset, copy, skb->len, skb) < 0)
1849 goto error_efault;
1850
1851 pfrag->offset += copy;
1852 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1853 skb->len += copy;
1854 skb->data_len += copy;
1855 skb->truesize += copy;
1856 wmem_alloc_delta += copy;
1857 } else {
1858 err = skb_zerocopy_iter_dgram(skb, from, copy);
1859 if (err < 0)
1860 goto error;
1861 }
1862 offset += copy;
1863 length -= copy;
1864 }
1865
1866 if (wmem_alloc_delta)
1867 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1868 return 0;
1869
1870 error_efault:
1871 err = -EFAULT;
1872 error:
1873 net_zcopy_put_abort(uarg, extra_uref);
1874 cork->length -= length;
1875 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1876 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1877 if (hold_tskey)
1878 atomic_dec(&sk->sk_tskey);
1879 return err;
1880 }
1881
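/*
 *	Append data to the pending queue on sk->sk_write_queue.  The first
 *	call on an empty queue sets up the cork (route reference, options,
 *	flow); later calls in the same corked sequence only add payload.
 */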
1882 int ip6_append_data(struct sock *sk,
1883 int getfrag(void *from, char *to, int offset, int len,
1884 int odd, struct sk_buff *skb),
1885 void *from, size_t length, int transhdrlen,
1886 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1887 struct rt6_info *rt, unsigned int flags)
1888 {
1889 struct inet_sock *inet = inet_sk(sk);
1890 struct ipv6_pinfo *np = inet6_sk(sk);
1891 int exthdrlen;
1892 int err;
1893
1894 if (flags&MSG_PROBE)
1895 return 0;
1896 if (skb_queue_empty(&sk->sk_write_queue)) {
1897 /*
1898 * setup for corking
1899 */
1900 dst_hold(&rt->dst);
1901 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1902 ipc6, rt);
1903 if (err)
1904 return err;
1905
1906 inet->cork.fl.u.ip6 = *fl6;
1907 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1908 length += exthdrlen;
1909 transhdrlen += exthdrlen;
1910 } else {
1911 transhdrlen = 0;
1912 }
1913
1914 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1915 &np->cork, sk_page_frag(sk), getfrag,
1916 from, length, transhdrlen, flags, ipc6);
1917 }
1918 EXPORT_SYMBOL_GPL(ip6_append_data);
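
/*
 * A minimal sketch (not part of this file) of how a datagram sender is
 * assumed to drive the corked API above: append the payload, then push
 * the pending frames, or flush them on error.  The function name is
 * hypothetical; ip_generic_getfrag() and the already-resolved flow,
 * route and cookie are taken as given.
 */
static int example_corked_send(struct sock *sk, struct msghdr *msg,
			       size_t len, struct ipcm6_cookie *ipc6,
			       struct flowi6 *fl6, struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	/* Queue the payload; the cork is set up on the first call. */
	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
			      0, ipc6, fl6, rt, msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
	release_sock(sk);
	return err;
}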
1919
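/* Hand the cork's dst reference over to the skb that is about to be
 * transmitted, so ip6_cork_release() will not drop it again.
 */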
1920 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1921 {
1922 struct dst_entry *dst = cork->base.dst;
1923
1924 cork->base.dst = NULL;
1925 cork->base.flags &= ~IPCORK_ALLFRAG;
1926 skb_dst_set(skb, dst);
1927 }
1928
1929 static void ip6_cork_release(struct inet_cork_full *cork,
1930 struct inet6_cork *v6_cork)
1931 {
1932 if (v6_cork->opt) {
1933 struct ipv6_txoptions *opt = v6_cork->opt;
1934
1935 kfree(opt->dst0opt);
1936 kfree(opt->dst1opt);
1937 kfree(opt->hopopt);
1938 kfree(opt->srcrt);
1939 kfree(opt);
1940 v6_cork->opt = NULL;
1941 }
1942
1943 if (cork->base.dst) {
1944 dst_release(cork->base.dst);
1945 cork->base.dst = NULL;
1946 cork->base.flags &= ~IPCORK_ALLFRAG;
1947 }
1948 }
1949
1950 struct sk_buff *__ip6_make_skb(struct sock *sk,
1951 struct sk_buff_head *queue,
1952 struct inet_cork_full *cork,
1953 struct inet6_cork *v6_cork)
1954 {
1955 struct sk_buff *skb, *tmp_skb;
1956 struct sk_buff **tail_skb;
1957 struct in6_addr *final_dst;
1958 struct ipv6_pinfo *np = inet6_sk(sk);
1959 struct net *net = sock_net(sk);
1960 struct ipv6hdr *hdr;
1961 struct ipv6_txoptions *opt = v6_cork->opt;
1962 struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1963 struct flowi6 *fl6 = &cork->fl.u.ip6;
1964 unsigned char proto = fl6->flowi6_proto;
1965
1966 skb = __skb_dequeue(queue);
1967 if (!skb)
1968 goto out;
1969 tail_skb = &(skb_shinfo(skb)->frag_list);
1970
1971 /* move skb->data to ip header from ext header */
1972 if (skb->data < skb_network_header(skb))
1973 __skb_pull(skb, skb_network_offset(skb));
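/* Chain the remaining queued skbs onto the first one's frag_list,
 * transferring their length and truesize accounting to the head skb.
 */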
1974 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1975 __skb_pull(tmp_skb, skb_network_header_len(skb));
1976 *tail_skb = tmp_skb;
1977 tail_skb = &(tmp_skb->next);
1978 skb->len += tmp_skb->len;
1979 skb->data_len += tmp_skb->len;
1980 skb->truesize += tmp_skb->truesize;
1981 tmp_skb->destructor = NULL;
1982 tmp_skb->sk = NULL;
1983 }
1984
1985 /* Allow local fragmentation. */
1986 skb->ignore_df = ip6_sk_ignore_df(sk);
1987 __skb_pull(skb, skb_network_header_len(skb));
1988
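/* Push any extension headers from the cork in front of the payload,
 * then prepend the IPv6 header itself.
 */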
1989 final_dst = &fl6->daddr;
1990 if (opt && opt->opt_flen)
1991 ipv6_push_frag_opts(skb, opt, &proto);
1992 if (opt && opt->opt_nflen)
1993 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1994
1995 skb_push(skb, sizeof(struct ipv6hdr));
1996 skb_reset_network_header(skb);
1997 hdr = ipv6_hdr(skb);
1998
1999 ip6_flow_hdr(hdr, v6_cork->tclass,
2000 ip6_make_flowlabel(net, skb, fl6->flowlabel,
2001 ip6_autoflowlabel(net, np), fl6));
2002 hdr->hop_limit = v6_cork->hop_limit;
2003 hdr->nexthdr = proto;
2004 hdr->saddr = fl6->saddr;
2005 hdr->daddr = *final_dst;
2006
2007 skb->priority = sk->sk_priority;
2008 skb->mark = cork->base.mark;
2009 skb->tstamp = cork->base.transmit_time;
2010
2011 ip6_cork_steal_dst(skb, cork);
2012 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
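/* For ICMPv6, bump the per-type and total outgoing message counters;
 * the type comes from the flow for raw sockets without a known
 * nexthop, and from the packet otherwise.
 */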
2013 if (proto == IPPROTO_ICMPV6) {
2014 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
2015 u8 icmp6_type;
2016
2017 if (sk->sk_socket->type == SOCK_RAW &&
2018 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
2019 icmp6_type = fl6->fl6_icmp_type;
2020 else
2021 icmp6_type = icmp6_hdr(skb)->icmp6_type;
2022 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
2023 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
2024 }
2025
2026 ip6_cork_release(cork, v6_cork);
2027 out:
2028 return skb;
2029 }
2030
2031 int ip6_send_skb(struct sk_buff *skb)
2032 {
2033 struct net *net = sock_net(skb->sk);
2034 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
2035 int err;
2036
2037 rcu_read_lock();
2038 err = ip6_local_out(net, skb->sk, skb);
2039 if (err) {
2040 if (err > 0)
2041 err = net_xmit_errno(err);
2042 if (err)
2043 IP6_INC_STATS(net, rt->rt6i_idev,
2044 IPSTATS_MIB_OUTDISCARDS);
2045 }
2046
2047 rcu_read_unlock();
2048 return err;
2049 }
2050
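/* Build a single skb from everything pending on sk->sk_write_queue and
 * hand it to the output path.
 */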
2051 int ip6_push_pending_frames(struct sock *sk)
2052 {
2053 struct sk_buff *skb;
2054
2055 skb = ip6_finish_skb(sk);
2056 if (!skb)
2057 return 0;
2058
2059 return ip6_send_skb(skb);
2060 }
2061 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2062
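/* Throw away everything still pending on the queue, counting each skb
 * that already had a route attached as an output discard.
 */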
2063 static void __ip6_flush_pending_frames(struct sock *sk,
2064 struct sk_buff_head *queue,
2065 struct inet_cork_full *cork,
2066 struct inet6_cork *v6_cork)
2067 {
2068 struct sk_buff *skb;
2069
2070 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2071 if (skb_dst(skb))
2072 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2073 IPSTATS_MIB_OUTDISCARDS);
2074 kfree_skb(skb);
2075 }
2076
2077 ip6_cork_release(cork, v6_cork);
2078 }
2079
2080 void ip6_flush_pending_frames(struct sock *sk)
2081 {
2082 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2083 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2084 }
2085 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2086
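/* Single-shot variant of the corked API: build the whole datagram on a
 * private queue with a caller-supplied cork, bypassing
 * sk->sk_write_queue.
 */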
2087 struct sk_buff *ip6_make_skb(struct sock *sk,
2088 int getfrag(void *from, char *to, int offset,
2089 int len, int odd, struct sk_buff *skb),
2090 void *from, size_t length, int transhdrlen,
2091 struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2092 unsigned int flags, struct inet_cork_full *cork)
2093 {
2094 struct inet6_cork v6_cork;
2095 struct sk_buff_head queue;
2096 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2097 int err;
2098
2099 if (flags & MSG_PROBE) {
2100 dst_release(&rt->dst);
2101 return NULL;
2102 }
2103
2104 __skb_queue_head_init(&queue);
2105
2106 cork->base.flags = 0;
2107 cork->base.addr = 0;
2108 cork->base.opt = NULL;
2109 v6_cork.opt = NULL;
2110 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2111 if (err) {
2112 ip6_cork_release(cork, &v6_cork);
2113 return ERR_PTR(err);
2114 }
2115 if (ipc6->dontfrag < 0)
2116 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2117
2118 err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2119 &current->task_frag, getfrag, from,
2120 length + exthdrlen, transhdrlen + exthdrlen,
2121 flags, ipc6);
2122 if (err) {
2123 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2124 return ERR_PTR(err);
2125 }
2126
2127 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2128 }
2129
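/*
 * A minimal sketch (not part of this file) of the single-shot path
 * ip6_make_skb() is assumed to serve: build the whole datagram in one
 * call and transmit it immediately.  The function name is hypothetical;
 * ip_generic_getfrag() and the prepared flow, route and cookie are
 * assumed.
 */
static int example_uncorked_send(struct sock *sk, struct msghdr *msg,
				 size_t len, struct ipcm6_cookie *ipc6,
				 struct flowi6 *fl6, struct rt6_info *rt)
{
	struct inet_cork_full cork;
	struct sk_buff *skb;

	/* __ip6_make_skb() reads the flow from the cork, so the caller
	 * must place it there (real callers build fl6 in the cork).
	 */
	cork.fl.u.ip6 = *fl6;

	/* ip6_make_skb() takes over the caller's route reference. */
	skb = ip6_make_skb(sk, ip_generic_getfrag, msg, len,
			   0, ipc6, rt, msg->msg_flags, &cork);
	if (IS_ERR_OR_NULL(skb))
		return PTR_ERR_OR_ZERO(skb);

	return ip6_send_skb(skb);
}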