1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
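/* Final transmit step for IPv6 output: resolve (or create) the neighbour
 * entry for the route's nexthop and hand the packet to neigh_output().
 * Also handles multicast loopback and lightweight-tunnel redirection.
 */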
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 struct dst_entry *dst = skb_dst(skb);
63 struct net_device *dev = dst->dev;
64 struct inet6_dev *idev = ip6_dst_idev(dst);
65 unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 const struct in6_addr *daddr, *nexthop;
67 struct ipv6hdr *hdr;
68 struct neighbour *neigh;
69 int ret;
70
71 /* Be paranoid, rather than too clever. */
72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 /* Make sure idev stays alive */
74 rcu_read_lock();
75 skb = skb_expand_head(skb, hh_len);
76 if (!skb) {
77 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
78 rcu_read_unlock();
79 return -ENOMEM;
80 }
81 rcu_read_unlock();
82 }
83
84 hdr = ipv6_hdr(skb);
85 daddr = &hdr->daddr;
86 if (ipv6_addr_is_multicast(daddr)) {
87 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
88 ((mroute6_is_socket(net, skb) &&
89 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
90 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
91 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
92
93 /* Do not check for IFF_ALLMULTI; multicast routing
94 is not supported in any case.
95 */
96 if (newskb)
97 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
98 net, sk, newskb, NULL, newskb->dev,
99 dev_loopback_xmit);
100
101 if (hdr->hop_limit == 0) {
102 IP6_INC_STATS(net, idev,
103 IPSTATS_MIB_OUTDISCARDS);
104 kfree_skb(skb);
105 return 0;
106 }
107 }
108
109 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
110 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
111 !(dev->flags & IFF_LOOPBACK)) {
112 kfree_skb(skb);
113 return 0;
114 }
115 }
116
117 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
118 int res = lwtunnel_xmit(skb);
119
120 if (res != LWTUNNEL_XMIT_CONTINUE)
121 return res;
122 }
123
124 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
125
126 rcu_read_lock();
127 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
128 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
129
130 if (unlikely(IS_ERR_OR_NULL(neigh))) {
131 if (unlikely(!neigh))
132 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
133 if (IS_ERR(neigh)) {
134 rcu_read_unlock();
135 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
136 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
137 return -EINVAL;
138 }
139 }
140 sock_confirm_neigh(skb, neigh);
141 ret = neigh_output(neigh, skb, false);
142 rcu_read_unlock();
143 return ret;
144 }
145
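/* Software-segment a GSO skb whose segments exceed the egress MTU, then
 * transmit each segment, fragmenting any that are still too large.
 */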
146 static int
147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
148 struct sk_buff *skb, unsigned int mtu)
149 {
150 struct sk_buff *segs, *nskb;
151 netdev_features_t features;
152 int ret = 0;
153
154 /* Please see corresponding comment in ip_finish_output_gso
155 * describing the cases where GSO segment length exceeds the
156 * egress MTU.
157 */
158 features = netif_skb_features(skb);
159 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
160 if (IS_ERR_OR_NULL(segs)) {
161 kfree_skb(skb);
162 return -ENOMEM;
163 }
164
165 consume_skb(skb);
166
167 skb_list_walk_safe(segs, segs, nskb) {
168 int err;
169
170 skb_mark_not_on_list(segs);
171 /* Last GSO segment can be smaller than gso_size (and MTU).
172 * Adding a fragment header would produce an "atomic fragment",
173 * which is considered harmful (RFC-8021). Avoid that.
174 */
175 err = segs->len > mtu ?
176 ip6_fragment(net, sk, segs, ip6_finish_output2) :
177 ip6_finish_output2(net, sk, segs);
178 if (err && ret == 0)
179 ret = err;
180 }
181
182 return ret;
183 }
184
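/* Pick the output path: re-run dst_output() after an XFRM policy change,
 * segment oversized GSO packets, fragment oversized plain packets, or hand
 * the skb straight to ip6_finish_output2().
 */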
185 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
186 {
187 unsigned int mtu;
188
189 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
190 /* Policy lookup after SNAT yielded a new policy */
191 if (skb_dst(skb)->xfrm) {
192 IP6CB(skb)->flags |= IP6SKB_REROUTED;
193 return dst_output(net, sk, skb);
194 }
195 #endif
196
197 mtu = ip6_skb_dst_mtu(skb);
198 if (skb_is_gso(skb) &&
199 !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
200 !skb_gso_validate_network_len(skb, mtu))
201 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
202
203 if ((skb->len > mtu && !skb_is_gso(skb)) ||
204 dst_allfrag(skb_dst(skb)) ||
205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
206 return ip6_fragment(net, sk, skb, ip6_finish_output2);
207 else
208 return ip6_finish_output2(net, sk, skb);
209 }
210
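/* Run the cgroup BPF egress hook; any verdict other than SUCCESS or CN
 * drops the packet before it reaches __ip6_finish_output().
 */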
211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 int ret;
214
215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 switch (ret) {
217 case NET_XMIT_SUCCESS:
218 case NET_XMIT_CN:
219 return __ip6_finish_output(net, sk, skb) ? : ret;
220 default:
221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 return ret;
223 }
224 }
225
226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
227 {
228 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
229 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
230
231 skb->protocol = htons(ETH_P_IPV6);
232 skb->dev = dev;
233
234 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
235 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
236 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
237 return 0;
238 }
239
240 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
241 net, sk, skb, indev, dev,
242 ip6_finish_output,
243 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
244 }
245 EXPORT_SYMBOL(ip6_output);
246
247 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
248 {
249 if (!np->autoflowlabel_set)
250 return ip6_default_np_autolabel(net);
251 else
252 return np->autoflowlabel;
253 }
254
255 /*
256 * xmit an sk_buff (used by TCP, SCTP and DCCP)
257 * Note : socket lock is not held for SYNACK packets, but might be modified
258 * by calls to skb_set_owner_w() and ipv6_local_error(),
259 * which are using proper atomic operations or spinlocks.
260 */
261 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
262 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
263 {
264 struct net *net = sock_net(sk);
265 const struct ipv6_pinfo *np = inet6_sk(sk);
266 struct in6_addr *first_hop = &fl6->daddr;
267 struct dst_entry *dst = skb_dst(skb);
268 struct net_device *dev = dst->dev;
269 struct inet6_dev *idev = ip6_dst_idev(dst);
270 struct hop_jumbo_hdr *hop_jumbo;
271 int hoplen = sizeof(*hop_jumbo);
272 unsigned int head_room;
273 struct ipv6hdr *hdr;
274 u8 proto = fl6->flowi6_proto;
275 int seg_len = skb->len;
276 int hlimit = -1;
277 u32 mtu;
278
279 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
280 if (opt)
281 head_room += opt->opt_nflen + opt->opt_flen;
282
283 if (unlikely(head_room > skb_headroom(skb))) {
284 /* Make sure idev stays alive */
285 rcu_read_lock();
286 skb = skb_expand_head(skb, head_room);
287 if (!skb) {
288 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
289 rcu_read_unlock();
290 return -ENOBUFS;
291 }
292 rcu_read_unlock();
293 }
294
295 if (opt) {
296 seg_len += opt->opt_nflen + opt->opt_flen;
297
298 if (opt->opt_flen)
299 ipv6_push_frag_opts(skb, opt, &proto);
300
301 if (opt->opt_nflen)
302 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
303 &fl6->saddr);
304 }
305
306 if (unlikely(seg_len > IPV6_MAXPLEN)) {
307 hop_jumbo = skb_push(skb, hoplen);
308
309 hop_jumbo->nexthdr = proto;
310 hop_jumbo->hdrlen = 0;
311 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
312 hop_jumbo->tlv_len = 4;
313 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
314
315 proto = IPPROTO_HOPOPTS;
316 seg_len = 0;
317 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
318 }
319
320 skb_push(skb, sizeof(struct ipv6hdr));
321 skb_reset_network_header(skb);
322 hdr = ipv6_hdr(skb);
323
324 /*
325 * Fill in the IPv6 header
326 */
327 if (np)
328 hlimit = np->hop_limit;
329 if (hlimit < 0)
330 hlimit = ip6_dst_hoplimit(dst);
331
332 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
333 ip6_autoflowlabel(net, np), fl6));
334
335 hdr->payload_len = htons(seg_len);
336 hdr->nexthdr = proto;
337 hdr->hop_limit = hlimit;
338
339 hdr->saddr = fl6->saddr;
340 hdr->daddr = *first_hop;
341
342 skb->protocol = htons(ETH_P_IPV6);
343 skb->priority = priority;
344 skb->mark = mark;
345
346 mtu = dst_mtu(dst);
347 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
348 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
349
350 /* if egress device is enslaved to an L3 master device pass the
351 * skb to its handler for processing
352 */
353 skb = l3mdev_ip6_out((struct sock *)sk, skb);
354 if (unlikely(!skb))
355 return 0;
356
357 /* hooks should never assume socket lock is held.
358 * we promote our socket to non const
359 */
360 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
361 net, (struct sock *)sk, skb, NULL, dev,
362 dst_output);
363 }
364
365 skb->dev = dev;
366 /* ipv6_local_error() does not require socket lock,
367 * we promote our socket to non const
368 */
369 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
370
371 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
372 kfree_skb(skb);
373 return -EMSGSIZE;
374 }
375 EXPORT_SYMBOL(ip6_xmit);
376
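/* Deliver a Router Alert packet to every raw socket registered for this
 * alert value via IPV6_ROUTER_ALERT. Returns 1 if the skb was consumed.
 */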
377 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
378 {
379 struct ip6_ra_chain *ra;
380 struct sock *last = NULL;
381
382 read_lock(&ip6_ra_lock);
383 for (ra = ip6_ra_chain; ra; ra = ra->next) {
384 struct sock *sk = ra->sk;
385 if (sk && ra->sel == sel &&
386 (!sk->sk_bound_dev_if ||
387 sk->sk_bound_dev_if == skb->dev->ifindex)) {
388 struct ipv6_pinfo *np = inet6_sk(sk);
389
390 if (np && np->rtalert_isolate &&
391 !net_eq(sock_net(sk), dev_net(skb->dev))) {
392 continue;
393 }
394 if (last) {
395 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
396 if (skb2)
397 rawv6_rcv(last, skb2);
398 }
399 last = sk;
400 }
401 }
402
403 if (last) {
404 rawv6_rcv(last, skb);
405 read_unlock(&ip6_ra_lock);
406 return 1;
407 }
408 read_unlock(&ip6_ra_lock);
409 return 0;
410 }
411
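/* For a destination covered by proxy NDP, decide whether the packet must be
 * handled locally (NDP messages, returns 1), forwarded (returns 0) or
 * dropped (returns -1 for link-local destinations).
 */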
412 static int ip6_forward_proxy_check(struct sk_buff *skb)
413 {
414 struct ipv6hdr *hdr = ipv6_hdr(skb);
415 u8 nexthdr = hdr->nexthdr;
416 __be16 frag_off;
417 int offset;
418
419 if (ipv6_ext_hdr(nexthdr)) {
420 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
421 if (offset < 0)
422 return 0;
423 } else
424 offset = sizeof(struct ipv6hdr);
425
426 if (nexthdr == IPPROTO_ICMPV6) {
427 struct icmp6hdr *icmp6;
428
429 if (!pskb_may_pull(skb, (skb_network_header(skb) +
430 offset + 1 - skb->data)))
431 return 0;
432
433 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
434
435 switch (icmp6->icmp6_type) {
436 case NDISC_ROUTER_SOLICITATION:
437 case NDISC_ROUTER_ADVERTISEMENT:
438 case NDISC_NEIGHBOUR_SOLICITATION:
439 case NDISC_NEIGHBOUR_ADVERTISEMENT:
440 case NDISC_REDIRECT:
441 /* For reaction involving unicast neighbor discovery
442 * message destined to the proxied address, pass it to
443 * input function.
444 */
445 return 1;
446 default:
447 break;
448 }
449 }
450
451 /*
452 * The proxying router can't forward traffic sent to a link-local
453 * address, so signal the sender and discard the packet. This
454 * behavior is clarified by the MIPv6 specification.
455 */
456 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
457 dst_link_failure(skb);
458 return -1;
459 }
460
461 return 0;
462 }
463
464 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
465 struct sk_buff *skb)
466 {
467 struct dst_entry *dst = skb_dst(skb);
468
469 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
470
471 #ifdef CONFIG_NET_SWITCHDEV
472 if (skb->offload_l3_fwd_mark) {
473 consume_skb(skb);
474 return 0;
475 }
476 #endif
477
478 skb_clear_tstamp(skb);
479 return dst_output(net, sk, skb);
480 }
481
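/* Decide whether a forwarded packet exceeds the path MTU, taking
 * conntrack-defrag state (frag_max_size), ignore_df and GSO into account.
 */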
482 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
483 {
484 if (skb->len <= mtu)
485 return false;
486
487 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
488 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
489 return true;
490
491 if (skb->ignore_df)
492 return false;
493
494 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
495 return false;
496
497 return true;
498 }
499
500 int ip6_forward(struct sk_buff *skb)
501 {
502 struct dst_entry *dst = skb_dst(skb);
503 struct ipv6hdr *hdr = ipv6_hdr(skb);
504 struct inet6_skb_parm *opt = IP6CB(skb);
505 struct net *net = dev_net(dst->dev);
506 struct inet6_dev *idev;
507 SKB_DR(reason);
508 u32 mtu;
509
510 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
511 if (net->ipv6.devconf_all->forwarding == 0)
512 goto error;
513
514 if (skb->pkt_type != PACKET_HOST)
515 goto drop;
516
517 if (unlikely(skb->sk))
518 goto drop;
519
520 if (skb_warn_if_lro(skb))
521 goto drop;
522
523 if (!net->ipv6.devconf_all->disable_policy &&
524 (!idev || !idev->cnf.disable_policy) &&
525 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
526 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
527 goto drop;
528 }
529
530 skb_forward_csum(skb);
531
532 /*
533 * We DO NOT make any processing on
534 * RA packets, pushing them to user level AS IS
535 * without any WARRANTY that application will be able
536 * to interpret them. The reason is that we
537 * cannot make anything clever here.
538 *
539 * We are not end-node, so that if packet contains
540 * AH/ESP, we cannot make anything.
541 * Defragmentation also would be a mistake, RA packets
542 * cannot be fragmented, because there is no guarantee
543 * that different fragments will go along one path. --ANK
544 */
545 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
546 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
547 return 0;
548 }
549
550 /*
551 * check and decrement ttl
552 */
553 if (hdr->hop_limit <= 1) {
554 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
555 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
556
557 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
558 return -ETIMEDOUT;
559 }
560
561 /* XXX: idev->cnf.proxy_ndp? */
562 if (net->ipv6.devconf_all->proxy_ndp &&
563 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
564 int proxied = ip6_forward_proxy_check(skb);
565 if (proxied > 0) {
566 /* It's tempting to decrease the hop limit
567 * here by 1, as we do at the end of the
568 * function too.
569 *
570 * But that would be incorrect, as proxying is
571 * not forwarding. The ip6_input function
572 * will handle this packet locally, and it
573 * depends on the hop limit being unchanged.
574 *
575 * One example is the NDP hop limit, that
576 * always has to stay 255, but other would be
577 * similar checks around RA packets, where the
578 * user can even change the desired limit.
579 */
580 return ip6_input(skb);
581 } else if (proxied < 0) {
582 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
583 goto drop;
584 }
585 }
586
587 if (!xfrm6_route_forward(skb)) {
588 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
589 SKB_DR_SET(reason, XFRM_POLICY);
590 goto drop;
591 }
592 dst = skb_dst(skb);
593
594 /* IPv6 specs say nothing about it, but it is clear that we cannot
595 send redirects to source routed frames.
596 We don't send redirects to frames decapsulated from IPsec.
597 */
598 if (IP6CB(skb)->iif == dst->dev->ifindex &&
599 opt->srcrt == 0 && !skb_sec_path(skb)) {
600 struct in6_addr *target = NULL;
601 struct inet_peer *peer;
602 struct rt6_info *rt;
603
604 /*
605 * incoming and outgoing devices are the same
606 * send a redirect.
607 */
608
609 rt = (struct rt6_info *) dst;
610 if (rt->rt6i_flags & RTF_GATEWAY)
611 target = &rt->rt6i_gateway;
612 else
613 target = &hdr->daddr;
614
615 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
616
617 /* Limit redirects both by destination (here)
618 and by source (inside ndisc_send_redirect)
619 */
620 if (inet_peer_xrlim_allow(peer, 1*HZ))
621 ndisc_send_redirect(skb, target);
622 if (peer)
623 inet_putpeer(peer);
624 } else {
625 int addrtype = ipv6_addr_type(&hdr->saddr);
626
627 /* This check is security critical. */
628 if (addrtype == IPV6_ADDR_ANY ||
629 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
630 goto error;
631 if (addrtype & IPV6_ADDR_LINKLOCAL) {
632 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
633 ICMPV6_NOT_NEIGHBOUR, 0);
634 goto error;
635 }
636 }
637
638 mtu = ip6_dst_mtu_maybe_forward(dst, true);
639 if (mtu < IPV6_MIN_MTU)
640 mtu = IPV6_MIN_MTU;
641
642 if (ip6_pkt_too_big(skb, mtu)) {
643 /* Again, force OUTPUT device used as source address */
644 skb->dev = dst->dev;
645 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
646 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
647 __IP6_INC_STATS(net, ip6_dst_idev(dst),
648 IPSTATS_MIB_FRAGFAILS);
649 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
650 return -EMSGSIZE;
651 }
652
653 if (skb_cow(skb, dst->dev->hard_header_len)) {
654 __IP6_INC_STATS(net, ip6_dst_idev(dst),
655 IPSTATS_MIB_OUTDISCARDS);
656 goto drop;
657 }
658
659 hdr = ipv6_hdr(skb);
660
661 /* Mangling hops number delayed to point after skb COW */
662
663 hdr->hop_limit--;
664
665 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
666 net, NULL, skb, skb->dev, dst->dev,
667 ip6_forward_finish);
668
669 error:
670 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
671 SKB_DR_SET(reason, IP_INADDRERRORS);
672 drop:
673 kfree_skb_reason(skb, reason);
674 return -EINVAL;
675 }
676
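/* Copy per-packet metadata (dst, priority, mark, netfilter and security
 * state) from the original skb to a freshly built fragment.
 */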
677 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
678 {
679 to->pkt_type = from->pkt_type;
680 to->priority = from->priority;
681 to->protocol = from->protocol;
682 skb_dst_drop(to);
683 skb_dst_set(to, dst_clone(skb_dst(from)));
684 to->dev = from->dev;
685 to->mark = from->mark;
686
687 skb_copy_hash(to, from);
688
689 #ifdef CONFIG_NET_SCHED
690 to->tc_index = from->tc_index;
691 #endif
692 nf_copy(to, from);
693 skb_ext_copy(to, from);
694 skb_copy_secmark(to, from);
695 }
696
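/* Fast-path fragmentation setup: reuse the skb's existing frag_list as the
 * fragment chain, saving the unfragmentable part (hlen bytes) in
 * iter->tmp_hdr and inserting the first fragment header.
 */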
697 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
698 u8 nexthdr, __be32 frag_id,
699 struct ip6_fraglist_iter *iter)
700 {
701 unsigned int first_len;
702 struct frag_hdr *fh;
703
704 /* BUILD HEADER */
705 *prevhdr = NEXTHDR_FRAGMENT;
706 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
707 if (!iter->tmp_hdr)
708 return -ENOMEM;
709
710 iter->frag = skb_shinfo(skb)->frag_list;
711 skb_frag_list_init(skb);
712
713 iter->offset = 0;
714 iter->hlen = hlen;
715 iter->frag_id = frag_id;
716 iter->nexthdr = nexthdr;
717
718 __skb_pull(skb, hlen);
719 fh = __skb_push(skb, sizeof(struct frag_hdr));
720 __skb_push(skb, hlen);
721 skb_reset_network_header(skb);
722 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
723
724 fh->nexthdr = nexthdr;
725 fh->reserved = 0;
726 fh->frag_off = htons(IP6_MF);
727 fh->identification = frag_id;
728
729 first_len = skb_pagelen(skb);
730 skb->data_len = first_len - skb_headlen(skb);
731 skb->len = first_len;
732 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
733
734 return 0;
735 }
736 EXPORT_SYMBOL(ip6_fraglist_init);
737
738 void ip6_fraglist_prepare(struct sk_buff *skb,
739 struct ip6_fraglist_iter *iter)
740 {
741 struct sk_buff *frag = iter->frag;
742 unsigned int hlen = iter->hlen;
743 struct frag_hdr *fh;
744
745 frag->ip_summed = CHECKSUM_NONE;
746 skb_reset_transport_header(frag);
747 fh = __skb_push(frag, sizeof(struct frag_hdr));
748 __skb_push(frag, hlen);
749 skb_reset_network_header(frag);
750 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
751 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
752 fh->nexthdr = iter->nexthdr;
753 fh->reserved = 0;
754 fh->frag_off = htons(iter->offset);
755 if (frag->next)
756 fh->frag_off |= htons(IP6_MF);
757 fh->identification = iter->frag_id;
758 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
759 ip6_copy_metadata(frag, skb);
760 }
761 EXPORT_SYMBOL(ip6_fraglist_prepare);
762
763 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
764 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
765 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
766 {
767 state->prevhdr = prevhdr;
768 state->nexthdr = nexthdr;
769 state->frag_id = frag_id;
770
771 state->hlen = hlen;
772 state->mtu = mtu;
773
774 state->left = skb->len - hlen; /* Space per frame */
775 state->ptr = hlen; /* Where to start from */
776
777 state->hroom = hdr_room;
778 state->troom = needed_tailroom;
779
780 state->offset = 0;
781 }
782 EXPORT_SYMBOL(ip6_frag_init);
783
784 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
785 {
786 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
787 struct sk_buff *frag;
788 struct frag_hdr *fh;
789 unsigned int len;
790
791 len = state->left;
792 /* IF: it doesn't fit, use 'mtu' - the data space left */
793 if (len > state->mtu)
794 len = state->mtu;
795 /* IF: we are not sending up to and including the packet end
796 then align the next start on an eight byte boundary */
797 if (len < state->left)
798 len &= ~7;
799
800 /* Allocate buffer */
801 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
802 state->hroom + state->troom, GFP_ATOMIC);
803 if (!frag)
804 return ERR_PTR(-ENOMEM);
805
806 /*
807 * Set up data on packet
808 */
809
810 ip6_copy_metadata(frag, skb);
811 skb_reserve(frag, state->hroom);
812 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
813 skb_reset_network_header(frag);
814 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
815 frag->transport_header = (frag->network_header + state->hlen +
816 sizeof(struct frag_hdr));
817
818 /*
819 * Charge the memory for the fragment to any owner
820 * it might possess
821 */
822 if (skb->sk)
823 skb_set_owner_w(frag, skb->sk);
824
825 /*
826 * Copy the packet header into the new buffer.
827 */
828 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
829
830 fragnexthdr_offset = skb_network_header(frag);
831 fragnexthdr_offset += prevhdr - skb_network_header(skb);
832 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
833
834 /*
835 * Build fragment header.
836 */
837 fh->nexthdr = state->nexthdr;
838 fh->reserved = 0;
839 fh->identification = state->frag_id;
840
841 /*
842 * Copy a block of the IP datagram.
843 */
844 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
845 len));
846 state->left -= len;
847
848 fh->frag_off = htons(state->offset);
849 if (state->left > 0)
850 fh->frag_off |= htons(IP6_MF);
851 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
852
853 state->ptr += len;
854 state->offset += len;
855
856 return frag;
857 }
858 EXPORT_SYMBOL(ip6_frag_next);
859
860 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
861 int (*output)(struct net *, struct sock *, struct sk_buff *))
862 {
863 struct sk_buff *frag;
864 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
865 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
866 inet6_sk(skb->sk) : NULL;
867 bool mono_delivery_time = skb->mono_delivery_time;
868 struct ip6_frag_state state;
869 unsigned int mtu, hlen, nexthdr_offset;
870 ktime_t tstamp = skb->tstamp;
871 int hroom, err = 0;
872 __be32 frag_id;
873 u8 *prevhdr, nexthdr = 0;
874
875 err = ip6_find_1stfragopt(skb, &prevhdr);
876 if (err < 0)
877 goto fail;
878 hlen = err;
879 nexthdr = *prevhdr;
880 nexthdr_offset = prevhdr - skb_network_header(skb);
881
882 mtu = ip6_skb_dst_mtu(skb);
883
884 /* We must not fragment if the socket is set to force MTU discovery
885 * or if the skb is not generated by a local socket.
886 */
887 if (unlikely(!skb->ignore_df && skb->len > mtu))
888 goto fail_toobig;
889
890 if (IP6CB(skb)->frag_max_size) {
891 if (IP6CB(skb)->frag_max_size > mtu)
892 goto fail_toobig;
893
894 /* don't send fragments larger than what we received */
895 mtu = IP6CB(skb)->frag_max_size;
896 if (mtu < IPV6_MIN_MTU)
897 mtu = IPV6_MIN_MTU;
898 }
899
900 if (np && np->frag_size < mtu) {
901 if (np->frag_size)
902 mtu = np->frag_size;
903 }
904 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
905 goto fail_toobig;
906 mtu -= hlen + sizeof(struct frag_hdr);
907
908 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
909 &ipv6_hdr(skb)->saddr);
910
911 if (skb->ip_summed == CHECKSUM_PARTIAL &&
912 (err = skb_checksum_help(skb)))
913 goto fail;
914
915 prevhdr = skb_network_header(skb) + nexthdr_offset;
916 hroom = LL_RESERVED_SPACE(rt->dst.dev);
917 if (skb_has_frag_list(skb)) {
918 unsigned int first_len = skb_pagelen(skb);
919 struct ip6_fraglist_iter iter;
920 struct sk_buff *frag2;
921
922 if (first_len - hlen > mtu ||
923 ((first_len - hlen) & 7) ||
924 skb_cloned(skb) ||
925 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
926 goto slow_path;
927
928 skb_walk_frags(skb, frag) {
929 /* Correct geometry. */
930 if (frag->len > mtu ||
931 ((frag->len & 7) && frag->next) ||
932 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
933 goto slow_path_clean;
934
935 /* Partially cloned skb? */
936 if (skb_shared(frag))
937 goto slow_path_clean;
938
939 BUG_ON(frag->sk);
940 if (skb->sk) {
941 frag->sk = skb->sk;
942 frag->destructor = sock_wfree;
943 }
944 skb->truesize -= frag->truesize;
945 }
946
947 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
948 &iter);
949 if (err < 0)
950 goto fail;
951
952 /* We prevent @rt from being freed. */
953 rcu_read_lock();
954
955 for (;;) {
956 /* Prepare header of the next frame,
957 * before previous one went down. */
958 if (iter.frag)
959 ip6_fraglist_prepare(skb, &iter);
960
961 skb_set_delivery_time(skb, tstamp, mono_delivery_time);
962 err = output(net, sk, skb);
963 if (!err)
964 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
965 IPSTATS_MIB_FRAGCREATES);
966
967 if (err || !iter.frag)
968 break;
969
970 skb = ip6_fraglist_next(&iter);
971 }
972
973 kfree(iter.tmp_hdr);
974
975 if (err == 0) {
976 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
977 IPSTATS_MIB_FRAGOKS);
978 rcu_read_unlock();
979 return 0;
980 }
981
982 kfree_skb_list(iter.frag);
983
984 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
985 IPSTATS_MIB_FRAGFAILS);
986 rcu_read_unlock();
987 return err;
988
989 slow_path_clean:
990 skb_walk_frags(skb, frag2) {
991 if (frag2 == frag)
992 break;
993 frag2->sk = NULL;
994 frag2->destructor = NULL;
995 skb->truesize += frag2->truesize;
996 }
997 }
998
999 slow_path:
1000 /*
1001 * Fragment the datagram.
1002 */
1003
1004 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1005 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1006 &state);
1007
1008 /*
1009 * Keep copying data until we run out.
1010 */
1011
1012 while (state.left > 0) {
1013 frag = ip6_frag_next(skb, &state);
1014 if (IS_ERR(frag)) {
1015 err = PTR_ERR(frag);
1016 goto fail;
1017 }
1018
1019 /*
1020 * Put this fragment into the sending queue.
1021 */
1022 skb_set_delivery_time(frag, tstamp, mono_delivery_time);
1023 err = output(net, sk, frag);
1024 if (err)
1025 goto fail;
1026
1027 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1028 IPSTATS_MIB_FRAGCREATES);
1029 }
1030 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1031 IPSTATS_MIB_FRAGOKS);
1032 consume_skb(skb);
1033 return err;
1034
1035 fail_toobig:
1036 if (skb->sk && dst_allfrag(skb_dst(skb)))
1037 sk_gso_disable(skb->sk);
1038
1039 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1040 err = -EMSGSIZE;
1041
1042 fail:
1043 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1044 IPSTATS_MIB_FRAGFAILS);
1045 kfree_skb(skb);
1046 return err;
1047 }
1048
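/* Return non-zero when the cached route no longer matches the flow's
 * address, i.e. the socket's cached dst must not be reused.
 */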
1049 static inline int ip6_rt_check(const struct rt6key *rt_key,
1050 const struct in6_addr *fl_addr,
1051 const struct in6_addr *addr_cache)
1052 {
1053 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1054 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1055 }
1056
1057 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1058 struct dst_entry *dst,
1059 const struct flowi6 *fl6)
1060 {
1061 struct ipv6_pinfo *np = inet6_sk(sk);
1062 struct rt6_info *rt;
1063
1064 if (!dst)
1065 goto out;
1066
1067 if (dst->ops->family != AF_INET6) {
1068 dst_release(dst);
1069 return NULL;
1070 }
1071
1072 rt = (struct rt6_info *)dst;
1073 /* Yes, checking route validity in the not-connected
1074 * case is not very simple. Take into account,
1075 * that we do not support routing by source, TOS,
1076 * and MSG_DONTROUTE --ANK (980726)
1077 *
1078 * 1. ip6_rt_check(): If route was host route,
1079 * check that cached destination is current.
1080 * If it is network route, we still may
1081 * check its validity using saved pointer
1082 * to the last used address: daddr_cache.
1083 * We do not want to save whole address now,
1084 * (because main consumer of this service
1085 * is tcp, which does not have this problem),
1086 * so that the last trick works only on connected
1087 * sockets.
1088 * 2. oif also should be the same.
1089 */
1090 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1091 #ifdef CONFIG_IPV6_SUBTREES
1092 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1093 #endif
1094 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1095 dst_release(dst);
1096 dst = NULL;
1097 }
1098
1099 out:
1100 return dst;
1101 }
1102
1103 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1104 struct dst_entry **dst, struct flowi6 *fl6)
1105 {
1106 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1107 struct neighbour *n;
1108 struct rt6_info *rt;
1109 #endif
1110 int err;
1111 int flags = 0;
1112
1113 /* The correct way to handle this would be to do
1114 * ip6_route_get_saddr, and then ip6_route_output; however,
1115 * the route-specific preferred source forces the
1116 * ip6_route_output call _before_ ip6_route_get_saddr.
1117 *
1118 * In source specific routing (no src=any default route),
1119 * ip6_route_output will fail given src=any saddr, though, so
1120 * that's why we try it again later.
1121 */
1122 if (ipv6_addr_any(&fl6->saddr)) {
1123 struct fib6_info *from;
1124 struct rt6_info *rt;
1125
1126 *dst = ip6_route_output(net, sk, fl6);
1127 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
1128
1129 rcu_read_lock();
1130 from = rt ? rcu_dereference(rt->from) : NULL;
1131 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1132 sk ? inet6_sk(sk)->srcprefs : 0,
1133 fl6->flowi6_l3mdev,
1134 &fl6->saddr);
1135 rcu_read_unlock();
1136
1137 if (err)
1138 goto out_err_release;
1139
1140 /* If we had an erroneous initial result, pretend it
1141 * never existed and let the SA-enabled version take
1142 * over.
1143 */
1144 if ((*dst)->error) {
1145 dst_release(*dst);
1146 *dst = NULL;
1147 }
1148
1149 if (fl6->flowi6_oif)
1150 flags |= RT6_LOOKUP_F_IFACE;
1151 }
1152
1153 if (!*dst)
1154 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1155
1156 err = (*dst)->error;
1157 if (err)
1158 goto out_err_release;
1159
1160 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1161 /*
1162 * Here if the dst entry we've looked up
1163 * has a neighbour entry that is in the INCOMPLETE
1164 * state and the src address from the flow is
1165 * marked as OPTIMISTIC, we release the found
1166 * dst entry and replace it instead with the
1167 * dst entry of the nexthop router
1168 */
1169 rt = (struct rt6_info *) *dst;
1170 rcu_read_lock();
1171 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1172 rt6_nexthop(rt, &fl6->daddr));
1173 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1174 rcu_read_unlock();
1175
1176 if (err) {
1177 struct inet6_ifaddr *ifp;
1178 struct flowi6 fl_gw6;
1179 int redirect;
1180
1181 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1182 (*dst)->dev, 1);
1183
1184 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1185 if (ifp)
1186 in6_ifa_put(ifp);
1187
1188 if (redirect) {
1189 /*
1190 * We need to get the dst entry for the
1191 * default router instead
1192 */
1193 dst_release(*dst);
1194 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1195 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1196 *dst = ip6_route_output(net, sk, &fl_gw6);
1197 err = (*dst)->error;
1198 if (err)
1199 goto out_err_release;
1200 }
1201 }
1202 #endif
1203 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1204 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1205 err = -EAFNOSUPPORT;
1206 goto out_err_release;
1207 }
1208
1209 return 0;
1210
1211 out_err_release:
1212 dst_release(*dst);
1213 *dst = NULL;
1214
1215 if (err == -ENETUNREACH)
1216 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1217 return err;
1218 }
1219
1220 /**
1221 * ip6_dst_lookup - perform route lookup on flow
1222 * @net: Network namespace to perform lookup in
1223 * @sk: socket which provides route info
1224 * @dst: pointer to dst_entry * for result
1225 * @fl6: flow to lookup
1226 *
1227 * This function performs a route lookup on the given flow.
1228 *
1229 * It returns zero on success, or a standard errno code on error.
1230 */
1231 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1232 struct flowi6 *fl6)
1233 {
1234 *dst = NULL;
1235 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1236 }
1237 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1238
1239 /**
1240 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1241 * @net: Network namespace to perform lookup in
1242 * @sk: socket which provides route info
1243 * @fl6: flow to lookup
1244 * @final_dst: final destination address for ipsec lookup
1245 *
1246 * This function performs a route lookup on the given flow.
1247 *
1248 * It returns a valid dst pointer on success, or a pointer encoded
1249 * error code.
1250 */
1251 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1252 const struct in6_addr *final_dst)
1253 {
1254 struct dst_entry *dst = NULL;
1255 int err;
1256
1257 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1258 if (err)
1259 return ERR_PTR(err);
1260 if (final_dst)
1261 fl6->daddr = *final_dst;
1262
1263 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1264 }
1265 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1266
1267 /**
1268 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1269 * @sk: socket which provides the dst cache and route info
1270 * @fl6: flow to lookup
1271 * @final_dst: final destination address for ipsec lookup
1272 * @connected: whether @sk is connected or not
1273 *
1274 * This function performs a route lookup on the given flow with the
1275 * possibility of using the cached route in the socket if it is valid.
1276 * It will take the socket dst lock when operating on the dst cache.
1277 * As a result, this function can only be used in process context.
1278 *
1279 * In addition, for a connected socket, cache the dst in the socket
1280 * if the current cache is not valid.
1281 *
1282 * It returns a valid dst pointer on success, or a pointer encoded
1283 * error code.
1284 */
1285 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1286 const struct in6_addr *final_dst,
1287 bool connected)
1288 {
1289 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1290
1291 dst = ip6_sk_dst_check(sk, dst, fl6);
1292 if (dst)
1293 return dst;
1294
1295 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1296 if (connected && !IS_ERR(dst))
1297 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1298
1299 return dst;
1300 }
1301 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1302
1303 /**
1304 * ip6_dst_lookup_tunnel - perform route lookup on tunnel
1305 * @skb: Packet for which lookup is done
1306 * @dev: Tunnel device
1307 * @net: Network namespace of tunnel device
1308 * @sock: Socket which provides route info
1309 * @saddr: Memory to store the src ip address
1310 * @info: Tunnel information
1311 * @protocol: IP protocol
1312 * @use_cache: Flag to enable cache usage
1313 * This function performs a route lookup on a tunnel
1314 *
1315 * It returns a valid dst pointer and stores src address to be used in
1316 * tunnel in param saddr on success, else a pointer encoded error code.
1317 */
1318
1319 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1320 struct net_device *dev,
1321 struct net *net,
1322 struct socket *sock,
1323 struct in6_addr *saddr,
1324 const struct ip_tunnel_info *info,
1325 u8 protocol,
1326 bool use_cache)
1327 {
1328 struct dst_entry *dst = NULL;
1329 #ifdef CONFIG_DST_CACHE
1330 struct dst_cache *dst_cache;
1331 #endif
1332 struct flowi6 fl6;
1333 __u8 prio;
1334
1335 #ifdef CONFIG_DST_CACHE
1336 dst_cache = (struct dst_cache *)&info->dst_cache;
1337 if (use_cache) {
1338 dst = dst_cache_get_ip6(dst_cache, saddr);
1339 if (dst)
1340 return dst;
1341 }
1342 #endif
1343 memset(&fl6, 0, sizeof(fl6));
1344 fl6.flowi6_mark = skb->mark;
1345 fl6.flowi6_proto = protocol;
1346 fl6.daddr = info->key.u.ipv6.dst;
1347 fl6.saddr = info->key.u.ipv6.src;
1348 prio = info->key.tos;
1349 fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1350
1351 dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1352 NULL);
1353 if (IS_ERR(dst)) {
1354 netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1355 return ERR_PTR(-ENETUNREACH);
1356 }
1357 if (dst->dev == dev) { /* is this necessary? */
1358 netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1359 dst_release(dst);
1360 return ERR_PTR(-ELOOP);
1361 }
1362 #ifdef CONFIG_DST_CACHE
1363 if (use_cache)
1364 dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1365 #endif
1366 *saddr = fl6.saddr;
1367 return dst;
1368 }
1369 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1370
1371 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1372 gfp_t gfp)
1373 {
1374 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1375 }
1376
1377 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1378 gfp_t gfp)
1379 {
1380 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1381 }
1382
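/* Recompute mtu and maxfraglen while appending data (non-XFRM-tunnel dst):
 * the first fragment reserves the dst header_len, later fragments may use
 * that room for payload.
 */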
1383 static void ip6_append_data_mtu(unsigned int *mtu,
1384 int *maxfraglen,
1385 unsigned int fragheaderlen,
1386 struct sk_buff *skb,
1387 struct rt6_info *rt,
1388 unsigned int orig_mtu)
1389 {
1390 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1391 if (!skb) {
1392 /* first fragment, reserve header_len */
1393 *mtu = orig_mtu - rt->dst.header_len;
1394
1395 } else {
1396 /*
1397 * this fragment is not first, the headers
1398 * space is regarded as data space.
1399 */
1400 *mtu = orig_mtu;
1401 }
1402 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1403 + fragheaderlen - sizeof(struct frag_hdr);
1404 }
1405 }
1406
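/* Initialise the cork for a sendmsg() sequence: take over the dst, duplicate
 * the tx options and record hop limit, traffic class, fragment size and
 * timestamping flags.
 */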
1407 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1408 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1409 struct rt6_info *rt)
1410 {
1411 struct ipv6_pinfo *np = inet6_sk(sk);
1412 unsigned int mtu;
1413 struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1414
1415 /* callers pass dst together with a reference, set it first so
1416 * ip6_cork_release() can put it down even in case of an error.
1417 */
1418 cork->base.dst = &rt->dst;
1419
1420 /*
1421 * setup for corking
1422 */
1423 if (opt) {
1424 if (WARN_ON(v6_cork->opt))
1425 return -EINVAL;
1426
1427 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1428 if (unlikely(!nopt))
1429 return -ENOBUFS;
1430
1431 nopt->tot_len = sizeof(*opt);
1432 nopt->opt_flen = opt->opt_flen;
1433 nopt->opt_nflen = opt->opt_nflen;
1434
1435 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1436 if (opt->dst0opt && !nopt->dst0opt)
1437 return -ENOBUFS;
1438
1439 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1440 if (opt->dst1opt && !nopt->dst1opt)
1441 return -ENOBUFS;
1442
1443 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1444 if (opt->hopopt && !nopt->hopopt)
1445 return -ENOBUFS;
1446
1447 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1448 if (opt->srcrt && !nopt->srcrt)
1449 return -ENOBUFS;
1450
1451 /* need source address above miyazawa*/
1452 }
1453 v6_cork->hop_limit = ipc6->hlimit;
1454 v6_cork->tclass = ipc6->tclass;
1455 if (rt->dst.flags & DST_XFRM_TUNNEL)
1456 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1457 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1458 else
1459 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1460 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1461 if (np->frag_size < mtu) {
1462 if (np->frag_size)
1463 mtu = np->frag_size;
1464 }
1465 cork->base.fragsize = mtu;
1466 cork->base.gso_size = ipc6->gso_size;
1467 cork->base.tx_flags = 0;
1468 cork->base.mark = ipc6->sockc.mark;
1469 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1470
1471 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1472 cork->base.flags |= IPCORK_ALLFRAG;
1473 cork->base.length = 0;
1474
1475 cork->base.transmit_time = ipc6->sockc.transmit_time;
1476
1477 return 0;
1478 }
1479
1480 static int __ip6_append_data(struct sock *sk,
1481 struct sk_buff_head *queue,
1482 struct inet_cork_full *cork_full,
1483 struct inet6_cork *v6_cork,
1484 struct page_frag *pfrag,
1485 int getfrag(void *from, char *to, int offset,
1486 int len, int odd, struct sk_buff *skb),
1487 void *from, size_t length, int transhdrlen,
1488 unsigned int flags, struct ipcm6_cookie *ipc6)
1489 {
1490 struct sk_buff *skb, *skb_prev = NULL;
1491 struct inet_cork *cork = &cork_full->base;
1492 struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1493 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1494 struct ubuf_info *uarg = NULL;
1495 int exthdrlen = 0;
1496 int dst_exthdrlen = 0;
1497 int hh_len;
1498 int copy;
1499 int err;
1500 int offset = 0;
1501 bool zc = false;
1502 u32 tskey = 0;
1503 struct rt6_info *rt = (struct rt6_info *)cork->dst;
1504 struct ipv6_txoptions *opt = v6_cork->opt;
1505 int csummode = CHECKSUM_NONE;
1506 unsigned int maxnonfragsize, headersize;
1507 unsigned int wmem_alloc_delta = 0;
1508 bool paged, extra_uref = false;
1509
1510 skb = skb_peek_tail(queue);
1511 if (!skb) {
1512 exthdrlen = opt ? opt->opt_flen : 0;
1513 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1514 }
1515
1516 paged = !!cork->gso_size;
1517 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1518 orig_mtu = mtu;
1519
1520 if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1521 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
1522 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1523
1524 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1525
1526 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1527 (opt ? opt->opt_nflen : 0);
1528
1529 headersize = sizeof(struct ipv6hdr) +
1530 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1531 (dst_allfrag(&rt->dst) ?
1532 sizeof(struct frag_hdr) : 0) +
1533 rt->rt6i_nfheader_len;
1534
1535 if (mtu <= fragheaderlen ||
1536 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1537 goto emsgsize;
1538
1539 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1540 sizeof(struct frag_hdr);
1541
1542 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1543 * the first fragment
1544 */
1545 if (headersize + transhdrlen > mtu)
1546 goto emsgsize;
1547
1548 if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1549 (sk->sk_protocol == IPPROTO_UDP ||
1550 sk->sk_protocol == IPPROTO_ICMPV6 ||
1551 sk->sk_protocol == IPPROTO_RAW)) {
1552 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1553 sizeof(struct ipv6hdr));
1554 goto emsgsize;
1555 }
1556
1557 if (ip6_sk_ignore_df(sk))
1558 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1559 else
1560 maxnonfragsize = mtu;
1561
1562 if (cork->length + length > maxnonfragsize - headersize) {
1563 emsgsize:
1564 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1565 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1566 return -EMSGSIZE;
1567 }
1568
1569 /* CHECKSUM_PARTIAL only with no extension headers and when
1570 * we are not going to fragment
1571 */
1572 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1573 headersize == sizeof(struct ipv6hdr) &&
1574 length <= mtu - headersize &&
1575 (!(flags & MSG_MORE) || cork->gso_size) &&
1576 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1577 csummode = CHECKSUM_PARTIAL;
1578
1579 if ((flags & MSG_ZEROCOPY) && length) {
1580 struct msghdr *msg = from;
1581
1582 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1583 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1584 return -EINVAL;
1585
1586 /* Leave uarg NULL if can't zerocopy, callers should
1587 * be able to handle it.
1588 */
1589 if ((rt->dst.dev->features & NETIF_F_SG) &&
1590 csummode == CHECKSUM_PARTIAL) {
1591 paged = true;
1592 zc = true;
1593 uarg = msg->msg_ubuf;
1594 }
1595 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1596 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1597 if (!uarg)
1598 return -ENOBUFS;
1599 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1600 if (rt->dst.dev->features & NETIF_F_SG &&
1601 csummode == CHECKSUM_PARTIAL) {
1602 paged = true;
1603 zc = true;
1604 } else {
1605 uarg_to_msgzc(uarg)->zerocopy = 0;
1606 skb_zcopy_set(skb, uarg, &extra_uref);
1607 }
1608 }
1609 } else if ((flags & MSG_SPLICE_PAGES) && length) {
1610 if (inet_test_bit(HDRINCL, sk))
1611 return -EPERM;
1612 if (rt->dst.dev->features & NETIF_F_SG &&
1613 getfrag == ip_generic_getfrag)
1614 /* We need an empty buffer to attach stuff to */
1615 paged = true;
1616 else
1617 flags &= ~MSG_SPLICE_PAGES;
1618 }
1619
1620 /*
1621 * Let's try using as much space as possible.
1622 * Use MTU if total length of the message fits into the MTU.
1623 * Otherwise, we need to reserve fragment header and
1624 * fragment alignment (= 8-15 octets, in total).
1625 *
1626 * Note that we may need to "move" the data from the tail
1627 * of the buffer to the new fragment when we split
1628 * the message.
1629 *
1630 * FIXME: It may be fragmented into multiple chunks
1631 * at once if non-fragmentable extension headers
1632 * are too large.
1633 * --yoshfuji
1634 */
1635
1636 cork->length += length;
1637 if (!skb)
1638 goto alloc_new_skb;
1639
1640 while (length > 0) {
1641 /* Check if the remaining data fits into current packet. */
1642 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1643 if (copy < length)
1644 copy = maxfraglen - skb->len;
1645
1646 if (copy <= 0) {
1647 char *data;
1648 unsigned int datalen;
1649 unsigned int fraglen;
1650 unsigned int fraggap;
1651 unsigned int alloclen, alloc_extra;
1652 unsigned int pagedlen;
1653 alloc_new_skb:
1654 /* There's no room in the current skb */
1655 if (skb)
1656 fraggap = skb->len - maxfraglen;
1657 else
1658 fraggap = 0;
1659 /* update mtu and maxfraglen if necessary */
1660 if (!skb || !skb_prev)
1661 ip6_append_data_mtu(&mtu, &maxfraglen,
1662 fragheaderlen, skb, rt,
1663 orig_mtu);
1664
1665 skb_prev = skb;
1666
1667 /*
1668 * If remaining data exceeds the mtu,
1669 * we know we need more fragment(s).
1670 */
1671 datalen = length + fraggap;
1672
1673 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1674 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1675 fraglen = datalen + fragheaderlen;
1676 pagedlen = 0;
1677
1678 alloc_extra = hh_len;
1679 alloc_extra += dst_exthdrlen;
1680 alloc_extra += rt->dst.trailer_len;
1681
1682 /* We just reserve space for fragment header.
1683 * Note: this may be overallocation if the message
1684 * (without MSG_MORE) fits into the MTU.
1685 */
1686 alloc_extra += sizeof(struct frag_hdr);
1687
1688 if ((flags & MSG_MORE) &&
1689 !(rt->dst.dev->features&NETIF_F_SG))
1690 alloclen = mtu;
1691 else if (!paged &&
1692 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1693 !(rt->dst.dev->features & NETIF_F_SG)))
1694 alloclen = fraglen;
1695 else {
1696 alloclen = fragheaderlen + transhdrlen;
1697 pagedlen = datalen - transhdrlen;
1698 }
1699 alloclen += alloc_extra;
1700
1701 if (datalen != length + fraggap) {
1702 /*
1703 * this is not the last fragment, the trailer
1704 * space is regarded as data space.
1705 */
1706 datalen += rt->dst.trailer_len;
1707 }
1708
1709 fraglen = datalen + fragheaderlen;
1710
1711 copy = datalen - transhdrlen - fraggap - pagedlen;
1712 /* [!] NOTE: copy may be negative if pagedlen>0
1713 * because then the equation may reduce to -fraggap.
1714 */
1715 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1716 err = -EINVAL;
1717 goto error;
1718 }
1719 if (transhdrlen) {
1720 skb = sock_alloc_send_skb(sk, alloclen,
1721 (flags & MSG_DONTWAIT), &err);
1722 } else {
1723 skb = NULL;
1724 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1725 2 * sk->sk_sndbuf)
1726 skb = alloc_skb(alloclen,
1727 sk->sk_allocation);
1728 if (unlikely(!skb))
1729 err = -ENOBUFS;
1730 }
1731 if (!skb)
1732 goto error;
1733 /*
1734 * Fill in the control structures
1735 */
1736 skb->protocol = htons(ETH_P_IPV6);
1737 skb->ip_summed = csummode;
1738 skb->csum = 0;
1739 /* reserve for fragmentation and ipsec header */
1740 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1741 dst_exthdrlen);
1742
1743 /*
1744 * Find where to start putting bytes
1745 */
1746 data = skb_put(skb, fraglen - pagedlen);
1747 skb_set_network_header(skb, exthdrlen);
1748 data += fragheaderlen;
1749 skb->transport_header = (skb->network_header +
1750 fragheaderlen);
1751 if (fraggap) {
1752 skb->csum = skb_copy_and_csum_bits(
1753 skb_prev, maxfraglen,
1754 data + transhdrlen, fraggap);
1755 skb_prev->csum = csum_sub(skb_prev->csum,
1756 skb->csum);
1757 data += fraggap;
1758 pskb_trim_unique(skb_prev, maxfraglen);
1759 }
1760 if (copy > 0 &&
1761 getfrag(from, data + transhdrlen, offset,
1762 copy, fraggap, skb) < 0) {
1763 err = -EFAULT;
1764 kfree_skb(skb);
1765 goto error;
1766 } else if (flags & MSG_SPLICE_PAGES) {
1767 copy = 0;
1768 }
1769
1770 offset += copy;
1771 length -= copy + transhdrlen;
1772 transhdrlen = 0;
1773 exthdrlen = 0;
1774 dst_exthdrlen = 0;
1775
1776 /* Only the initial fragment is time stamped */
1777 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1778 cork->tx_flags = 0;
1779 skb_shinfo(skb)->tskey = tskey;
1780 tskey = 0;
1781 skb_zcopy_set(skb, uarg, &extra_uref);
1782
1783 if ((flags & MSG_CONFIRM) && !skb_prev)
1784 skb_set_dst_pending_confirm(skb, 1);
1785
1786 /*
1787 * Put the packet on the pending queue
1788 */
1789 if (!skb->destructor) {
1790 skb->destructor = sock_wfree;
1791 skb->sk = sk;
1792 wmem_alloc_delta += skb->truesize;
1793 }
1794 __skb_queue_tail(queue, skb);
1795 continue;
1796 }
1797
1798 if (copy > length)
1799 copy = length;
1800
1801 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1802 skb_tailroom(skb) >= copy) {
1803 unsigned int off;
1804
1805 off = skb->len;
1806 if (getfrag(from, skb_put(skb, copy),
1807 offset, copy, off, skb) < 0) {
1808 __skb_trim(skb, off);
1809 err = -EFAULT;
1810 goto error;
1811 }
1812 } else if (flags & MSG_SPLICE_PAGES) {
1813 struct msghdr *msg = from;
1814
1815 err = -EIO;
1816 if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1817 goto error;
1818
1819 err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1820 sk->sk_allocation);
1821 if (err < 0)
1822 goto error;
1823 copy = err;
1824 wmem_alloc_delta += copy;
1825 } else if (!zc) {
1826 int i = skb_shinfo(skb)->nr_frags;
1827
1828 err = -ENOMEM;
1829 if (!sk_page_frag_refill(sk, pfrag))
1830 goto error;
1831
1832 skb_zcopy_downgrade_managed(skb);
1833 if (!skb_can_coalesce(skb, i, pfrag->page,
1834 pfrag->offset)) {
1835 err = -EMSGSIZE;
1836 if (i == MAX_SKB_FRAGS)
1837 goto error;
1838
1839 __skb_fill_page_desc(skb, i, pfrag->page,
1840 pfrag->offset, 0);
1841 skb_shinfo(skb)->nr_frags = ++i;
1842 get_page(pfrag->page);
1843 }
1844 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1845 if (getfrag(from,
1846 page_address(pfrag->page) + pfrag->offset,
1847 offset, copy, skb->len, skb) < 0)
1848 goto error_efault;
1849
1850 pfrag->offset += copy;
1851 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1852 skb->len += copy;
1853 skb->data_len += copy;
1854 skb->truesize += copy;
1855 wmem_alloc_delta += copy;
1856 } else {
1857 err = skb_zerocopy_iter_dgram(skb, from, copy);
1858 if (err < 0)
1859 goto error;
1860 }
1861 offset += copy;
1862 length -= copy;
1863 }
1864
	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

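/*
 * Append data to the socket's pending output queue (sk->sk_write_queue).
 * On the first call for an empty queue this also sets up the cork: it takes
 * a reference on the route, stores the flow information and accounts for
 * the extension header length in both @length and @transhdrlen.  The packet
 * itself is built and sent later by ip6_push_pending_frames().
 */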
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

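/*
 * Move the dst held by the cork onto the skb without touching its refcount
 * and clear the per-cork fragmentation state.
 */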
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	cork->base.flags &= ~IPCORK_ALLFRAG;
	skb_dst_set(skb, dst);
}

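/* Free the cached extension headers and drop the route held by the cork. */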
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
}

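/*
 * Collapse the queue of pending skbs into a single packet: chain all but
 * the first skb onto frag_list, push the extension headers and the IPv6
 * header, fill them in from the cork and flow information, attach the dst
 * and update the output statistics.  The cork is released before returning.
 */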
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

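/*
 * Transmit a packet built by __ip6_make_skb() via ip6_local_out(),
 * converting positive qdisc return codes with net_xmit_errno() and
 * counting failures as output discards.
 */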
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

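/* Build the socket's pending queue into a single packet and send it. */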
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

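/*
 * Throw away everything left on the pending queue, counting each skb that
 * already has a route attached as an output discard, then release the cork.
 */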
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

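/* Discard any data pending on the socket's default write queue. */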
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

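/*
 * Non-corked fast path used by datagram sockets: set up a private cork and
 * queue, append all of the data in one pass and return the finished skb or
 * an ERR_PTR.  MSG_PROBE only drops the route reference.
 */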
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
