xref: /openbmc/linux/io_uring/net.c (revision e6ed68cb)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/file.h>
5 #include <linux/slab.h>
6 #include <linux/net.h>
7 #include <linux/compat.h>
8 #include <net/compat.h>
9 #include <linux/io_uring.h>
10 
11 #include <uapi/linux/io_uring.h>
12 
13 #include "io_uring.h"
14 #include "kbuf.h"
15 #include "alloc_cache.h"
16 #include "net.h"
17 #include "notif.h"
18 #include "rsrc.h"
19 
20 #if defined(CONFIG_NET)
/* Per-request command state used by io_shutdown_prep() and io_shutdown(). */
struct io_shutdown {
	struct file			*file;
	/* read from sqe->len at prep time, passed to __sys_shutdown_sock() */
	int				how;
};
25 
/*
 * Per-request command state for accept requests.
 *
 * NOTE(review): the handlers that fill and consume this struct are not in
 * this part of the file; presumably addr/addr_len are the user-supplied
 * peer address outputs and file_slot selects a fixed-file slot -- confirm
 * against the accept prep/issue code.
 */
struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};
34 
/*
 * Per-request command state for socket-creation requests.
 *
 * NOTE(review): the handlers using this struct are not in this part of the
 * file; domain/type/protocol presumably mirror the socket(2) arguments --
 * confirm against the socket prep/issue code.
 */
struct io_socket {
	struct file			*file;
	int				domain;
	int				type;
	int				protocol;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};
44 
/*
 * Per-request command state for connect requests.
 *
 * NOTE(review): the handlers using this struct are not in this part of the
 * file; in_progress/seen_econnaborted presumably track retry state across
 * issue attempts -- confirm against the connect issue code.
 */
struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
	bool				in_progress;
	bool				seen_econnaborted;
};
52 
/*
 * Per-request command state shared by the send, sendmsg, recv, recvmsg and
 * zerocopy send handlers in this file.
 */
struct io_sr_msg {
	struct file			*file;
	/*
	 * All three alias sqe->addr: the *msg variants read a (compat)
	 * msghdr from it, the plain send/recv variants use it as the data
	 * buffer. Multishot recvmsg also stashes the selected buffer in ->buf.
	 */
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	/* from sqe->len; replaced by the iovec/buffer length with buffer select */
	unsigned			len;
	/* bytes moved by earlier partial attempts, folded into the final result */
	unsigned			done_io;
	/* MSG_* flags from sqe->msg_flags (plus flags forced at prep time) */
	unsigned			msg_flags;
	/* consecutive multishot retries, bounded by MULTISHOT_MAX_RETRY */
	unsigned			nr_multishot_loops;
	/* IORING_RECVSEND_* / IORING_RECV_* flags read from sqe->ioprio */
	u16				flags;
	/* initialised and used only by !msg send variants */
	u16				addr_len;
	/* buffer group saved at prep time for multishot receives */
	u16				buf_group;
	/* destination address from sqe->addr2 (send variants) */
	void __user			*addr;
	/* user control pointer saved by io_sendmsg_copy_hdr(), restored on retry */
	void __user			*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb 		*notif;
};
73 
/*
 * Number of times we'll try to do receives back-to-back if there's more
 * data. If we exceed this limit, then add us to the back of the queue and
 * retry from there. This helps fairness between flooding clients.
 *
 * io_recv_finish() compares io_sr_msg->nr_multishot_loops against this limit
 * and requests IOU_REQUEUE once it is reached.
 */
#define MULTISHOT_MAX_RETRY	32
80 
81 static inline bool io_check_multishot(struct io_kiocb *req,
82 				      unsigned int issue_flags)
83 {
84 	/*
85 	 * When ->locked_cq is set we only allow to post CQEs from the original
86 	 * task context. Usual request completions will be handled in other
87 	 * generic paths but multipoll may decide to post extra cqes.
88 	 */
89 	return !(issue_flags & IO_URING_F_IOWQ) ||
90 		!(issue_flags & IO_URING_F_MULTISHOT) ||
91 		!req->ctx->task_complete;
92 }
93 
94 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
95 {
96 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
97 
98 	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
99 		     sqe->buf_index || sqe->splice_fd_in))
100 		return -EINVAL;
101 
102 	shutdown->how = READ_ONCE(sqe->len);
103 	req->flags |= REQ_F_FORCE_ASYNC;
104 	return 0;
105 }
106 
107 int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
108 {
109 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
110 	struct socket *sock;
111 	int ret;
112 
113 	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
114 
115 	sock = sock_from_file(req->file);
116 	if (unlikely(!sock))
117 		return -ENOTSOCK;
118 
119 	ret = __sys_shutdown_sock(sock, shutdown->how);
120 	io_req_set_res(req, ret, 0);
121 	return IOU_OK;
122 }
123 
124 static bool io_net_retry(struct socket *sock, int flags)
125 {
126 	if (!(flags & MSG_WAITALL))
127 		return false;
128 	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
129 }
130 
131 static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
132 {
133 	struct io_async_msghdr *hdr = req->async_data;
134 
135 	if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
136 		return;
137 
138 	/* Let normal cleanup path reap it if we fail adding to the cache */
139 	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
140 		req->async_data = NULL;
141 		req->flags &= ~REQ_F_ASYNC_DATA;
142 	}
143 }
144 
145 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
146 						  unsigned int issue_flags)
147 {
148 	struct io_ring_ctx *ctx = req->ctx;
149 	struct io_cache_entry *entry;
150 	struct io_async_msghdr *hdr;
151 
152 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
153 		entry = io_alloc_cache_get(&ctx->netmsg_cache);
154 		if (entry) {
155 			hdr = container_of(entry, struct io_async_msghdr, cache);
156 			hdr->free_iov = NULL;
157 			req->flags |= REQ_F_ASYNC_DATA;
158 			req->async_data = hdr;
159 			return hdr;
160 		}
161 	}
162 
163 	if (!io_alloc_async_data(req)) {
164 		hdr = req->async_data;
165 		hdr->free_iov = NULL;
166 		return hdr;
167 	}
168 	return NULL;
169 }
170 
/*
 * Allocation helper for the ->prep_async handlers. Those always run from the
 * submission context, so no issue flags (in particular not
 * IO_URING_F_UNLOCKED) are passed down.
 */
static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
{
	const unsigned int prep_issue_flags = 0;

	return io_msg_alloc_async(req, prep_issue_flags);
}
176 
/*
 * Preserve the message header @kmsg in async data attached to @req so the
 * request can be retried later.
 *
 * Returns -EAGAIN in every case except allocation failure, where
 * kmsg->free_iov is freed and -ENOMEM is returned. If @req already carries
 * async data nothing is copied.
 */
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg,
			      unsigned int issue_flags)
{
	struct io_async_msghdr *async_msg;

	/* header already lives in async data from an earlier attempt */
	if (req_has_async_data(req))
		return -EAGAIN;
	async_msg = io_msg_alloc_async(req, issue_flags);
	if (!async_msg) {
		kfree(kmsg->free_iov);
		return -ENOMEM;
	}
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	/* msg_name was copied verbatim; make it point at the copy's own storage */
	if (async_msg->msg.msg_name)
		async_msg->msg.msg_name = &async_msg->addr;

	/* buffer select with no segments in the iterator: no iovec pointer to fix up */
	if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
		return -EAGAIN;

	/* if we're using fast_iov, point the copied iterator at the copy's fast_iov */
	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
	}

	return -EAGAIN;
}
206 
207 #ifdef CONFIG_COMPAT
208 static int io_compat_msg_copy_hdr(struct io_kiocb *req,
209 				  struct io_async_msghdr *iomsg,
210 				  struct compat_msghdr *msg, int ddir)
211 {
212 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
213 	struct compat_iovec __user *uiov;
214 	int ret;
215 
216 	if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg)))
217 		return -EFAULT;
218 
219 	uiov = compat_ptr(msg->msg_iov);
220 	if (req->flags & REQ_F_BUFFER_SELECT) {
221 		compat_ssize_t clen;
222 
223 		iomsg->free_iov = NULL;
224 		if (msg->msg_iovlen == 0) {
225 			sr->len = 0;
226 		} else if (msg->msg_iovlen > 1) {
227 			return -EINVAL;
228 		} else {
229 			if (!access_ok(uiov, sizeof(*uiov)))
230 				return -EFAULT;
231 			if (__get_user(clen, &uiov->iov_len))
232 				return -EFAULT;
233 			if (clen < 0)
234 				return -EINVAL;
235 			sr->len = clen;
236 		}
237 
238 		return 0;
239 	}
240 
241 	iomsg->free_iov = iomsg->fast_iov;
242 	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen,
243 				UIO_FASTIOV, &iomsg->free_iov,
244 				&iomsg->msg.msg_iter, true);
245 	if (unlikely(ret < 0))
246 		return ret;
247 
248 	return 0;
249 }
250 #endif
251 
252 static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
253 			   struct user_msghdr *msg, int ddir)
254 {
255 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
256 	int ret;
257 
258 	if (copy_from_user(msg, sr->umsg, sizeof(*sr->umsg)))
259 		return -EFAULT;
260 
261 	if (req->flags & REQ_F_BUFFER_SELECT) {
262 		if (msg->msg_iovlen == 0) {
263 			sr->len = iomsg->fast_iov[0].iov_len = 0;
264 			iomsg->fast_iov[0].iov_base = NULL;
265 			iomsg->free_iov = NULL;
266 		} else if (msg->msg_iovlen > 1) {
267 			return -EINVAL;
268 		} else {
269 			if (copy_from_user(iomsg->fast_iov, msg->msg_iov,
270 					   sizeof(*msg->msg_iov)))
271 				return -EFAULT;
272 			sr->len = iomsg->fast_iov[0].iov_len;
273 			iomsg->free_iov = NULL;
274 		}
275 
276 		return 0;
277 	}
278 
279 	iomsg->free_iov = iomsg->fast_iov;
280 	ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV,
281 				&iomsg->free_iov, &iomsg->msg.msg_iter, false);
282 	if (unlikely(ret < 0))
283 		return ret;
284 
285 	return 0;
286 }
287 
288 static int io_sendmsg_copy_hdr(struct io_kiocb *req,
289 			       struct io_async_msghdr *iomsg)
290 {
291 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
292 	struct user_msghdr msg;
293 	int ret;
294 
295 	iomsg->msg.msg_name = &iomsg->addr;
296 	iomsg->msg.msg_iter.nr_segs = 0;
297 
298 #ifdef CONFIG_COMPAT
299 	if (unlikely(req->ctx->compat)) {
300 		struct compat_msghdr cmsg;
301 
302 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE);
303 		if (unlikely(ret))
304 			return ret;
305 
306 		return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL);
307 	}
308 #endif
309 
310 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE);
311 	if (unlikely(ret))
312 		return ret;
313 
314 	ret = __copy_msghdr(&iomsg->msg, &msg, NULL);
315 
316 	/* save msg_control as sys_sendmsg() overwrites it */
317 	sr->msg_control = iomsg->msg.msg_control_user;
318 	return ret;
319 }
320 
321 int io_send_prep_async(struct io_kiocb *req)
322 {
323 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
324 	struct io_async_msghdr *io;
325 	int ret;
326 
327 	if (!zc->addr || req_has_async_data(req))
328 		return 0;
329 	io = io_msg_alloc_async_prep(req);
330 	if (!io)
331 		return -ENOMEM;
332 	ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
333 	return ret;
334 }
335 
336 static int io_setup_async_addr(struct io_kiocb *req,
337 			      struct sockaddr_storage *addr_storage,
338 			      unsigned int issue_flags)
339 {
340 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
341 	struct io_async_msghdr *io;
342 
343 	if (!sr->addr || req_has_async_data(req))
344 		return -EAGAIN;
345 	io = io_msg_alloc_async(req, issue_flags);
346 	if (!io)
347 		return -ENOMEM;
348 	memcpy(&io->addr, addr_storage, sizeof(io->addr));
349 	return -EAGAIN;
350 }
351 
352 int io_sendmsg_prep_async(struct io_kiocb *req)
353 {
354 	int ret;
355 
356 	if (!io_msg_alloc_async_prep(req))
357 		return -ENOMEM;
358 	ret = io_sendmsg_copy_hdr(req, req->async_data);
359 	if (!ret)
360 		req->flags |= REQ_F_NEED_CLEANUP;
361 	return ret;
362 }
363 
364 void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
365 {
366 	struct io_async_msghdr *io = req->async_data;
367 
368 	kfree(io->free_iov);
369 }
370 
371 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
372 {
373 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
374 
375 	if (req->opcode == IORING_OP_SEND) {
376 		if (READ_ONCE(sqe->__pad3[0]))
377 			return -EINVAL;
378 		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
379 		sr->addr_len = READ_ONCE(sqe->addr_len);
380 	} else if (sqe->addr2 || sqe->file_index) {
381 		return -EINVAL;
382 	}
383 
384 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
385 	sr->len = READ_ONCE(sqe->len);
386 	sr->flags = READ_ONCE(sqe->ioprio);
387 	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
388 		return -EINVAL;
389 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
390 	if (sr->msg_flags & MSG_DONTWAIT)
391 		req->flags |= REQ_F_NOWAIT;
392 
393 #ifdef CONFIG_COMPAT
394 	if (req->ctx->compat)
395 		sr->msg_flags |= MSG_CMSG_COMPAT;
396 #endif
397 	sr->done_io = 0;
398 	return 0;
399 }
400 
/*
 * Issue handler for sendmsg requests.
 *
 * Uses the async msghdr if one is attached to @req, otherwise copies the
 * header from userspace into an on-stack one. When the send has to be
 * retried (poll-first, -EAGAIN on a nonblocking issue, or a short
 * MSG_WAITALL send on a stream/seqpacket socket) the header is moved to
 * async data via io_setup_async_msg() and its return value is returned.
 *
 * Otherwise the result is stored in the request and IOU_OK is returned.
 * Early failures (-ENOTSOCK, header copy errors) are returned directly.
 */
int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
		/* restore the control pointer saved by io_sendmsg_copy_hdr() */
		kmsg->msg.msg_control_user = sr->msg_control;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	/* poll-first requested and not polled yet: stash the header, don't send */
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	/* with MSG_WAITALL anything below the full iterator length is short */
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);
		if (ret > 0 && io_net_retry(sock, flags)) {
			/* partial send: drop the control data, record progress, retry */
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_netmsg_recycle(req, issue_flags);
	/* report total progress; an error after partial progress reports the progress */
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
462 
/*
 * Issue handler for plain send requests (user buffer plus optional
 * destination address).
 *
 * The destination address comes from async data when present, otherwise it
 * is copied from userspace into an on-stack sockaddr_storage. When the send
 * has to be retried (poll-first, -EAGAIN on a nonblocking issue, or a short
 * MSG_WAITALL send on a stream/seqpacket socket) the address is preserved
 * via io_setup_async_addr() and its return value is returned.
 *
 * Otherwise the result is stored in the request and IOU_OK is returned.
 * Early failures (address copy, -ENOTSOCK, buffer import) are returned
 * directly.
 */
int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address;
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_ubuf = NULL;

	if (sr->addr) {
		if (req_has_async_data(req)) {
			struct io_async_msghdr *io = req->async_data;

			msg.msg_name = &io->addr;
		} else {
			ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
			if (unlikely(ret < 0))
				return ret;
			msg.msg_name = (struct sockaddr *)&__address;
		}
		msg.msg_namelen = sr->addr_len;
	}

	/*
	 * poll-first requested and not polled yet: keep the address for later.
	 * io_setup_async_addr() only reads __address when there is an address
	 * and no async data yet, i.e. when it was filled in just above.
	 */
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_addr(req, &__address, issue_flags);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	/* with MSG_WAITALL anything below the full buffer length is short */
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_addr(req, &__address, issue_flags);

		if (ret > 0 && io_net_retry(sock, flags)) {
			/* partial send: advance past what went out and retry */
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, &__address, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* report total progress; an error after partial progress reports the progress */
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}
536 
537 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
538 				 struct io_async_msghdr *iomsg,
539 				 size_t namelen, size_t controllen)
540 {
541 	if ((req->flags & (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) ==
542 			  (REQ_F_APOLL_MULTISHOT|REQ_F_BUFFER_SELECT)) {
543 		int hdr;
544 
545 		if (unlikely(namelen < 0))
546 			return -EOVERFLOW;
547 		if (check_add_overflow(sizeof(struct io_uring_recvmsg_out),
548 					namelen, &hdr))
549 			return -EOVERFLOW;
550 		if (check_add_overflow(hdr, controllen, &hdr))
551 			return -EOVERFLOW;
552 
553 		iomsg->namelen = namelen;
554 		iomsg->controllen = controllen;
555 		return 0;
556 	}
557 
558 	return 0;
559 }
560 
561 static int io_recvmsg_copy_hdr(struct io_kiocb *req,
562 			       struct io_async_msghdr *iomsg)
563 {
564 	struct user_msghdr msg;
565 	int ret;
566 
567 	iomsg->msg.msg_name = &iomsg->addr;
568 	iomsg->msg.msg_iter.nr_segs = 0;
569 
570 #ifdef CONFIG_COMPAT
571 	if (unlikely(req->ctx->compat)) {
572 		struct compat_msghdr cmsg;
573 
574 		ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST);
575 		if (unlikely(ret))
576 			return ret;
577 
578 		ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr);
579 		if (unlikely(ret))
580 			return ret;
581 
582 		return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen,
583 						cmsg.msg_controllen);
584 	}
585 #endif
586 
587 	ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST);
588 	if (unlikely(ret))
589 		return ret;
590 
591 	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
592 	if (unlikely(ret))
593 		return ret;
594 
595 	return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen,
596 					msg.msg_controllen);
597 }
598 
599 int io_recvmsg_prep_async(struct io_kiocb *req)
600 {
601 	struct io_async_msghdr *iomsg;
602 	int ret;
603 
604 	if (!io_msg_alloc_async_prep(req))
605 		return -ENOMEM;
606 	iomsg = req->async_data;
607 	ret = io_recvmsg_copy_hdr(req, iomsg);
608 	if (!ret)
609 		req->flags |= REQ_F_NEED_CLEANUP;
610 	return ret;
611 }
612 
613 #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
614 
615 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
616 {
617 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
618 
619 	if (unlikely(sqe->file_index || sqe->addr2))
620 		return -EINVAL;
621 
622 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
623 	sr->len = READ_ONCE(sqe->len);
624 	sr->flags = READ_ONCE(sqe->ioprio);
625 	if (sr->flags & ~(RECVMSG_FLAGS))
626 		return -EINVAL;
627 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
628 	if (sr->msg_flags & MSG_DONTWAIT)
629 		req->flags |= REQ_F_NOWAIT;
630 	if (sr->msg_flags & MSG_ERRQUEUE)
631 		req->flags |= REQ_F_CLEAR_POLLIN;
632 	if (sr->flags & IORING_RECV_MULTISHOT) {
633 		if (!(req->flags & REQ_F_BUFFER_SELECT))
634 			return -EINVAL;
635 		if (sr->msg_flags & MSG_WAITALL)
636 			return -EINVAL;
637 		if (req->opcode == IORING_OP_RECV && sr->len)
638 			return -EINVAL;
639 		req->flags |= REQ_F_APOLL_MULTISHOT;
640 		/*
641 		 * Store the buffer group for this multishot receive separately,
642 		 * as if we end up doing an io-wq based issue that selects a
643 		 * buffer, it has to be committed immediately and that will
644 		 * clear ->buf_list. This means we lose the link to the buffer
645 		 * list, and the eventual buffer put on completion then cannot
646 		 * restore it.
647 		 */
648 		sr->buf_group = req->buf_index;
649 	}
650 
651 #ifdef CONFIG_COMPAT
652 	if (req->ctx->compat)
653 		sr->msg_flags |= MSG_CMSG_COMPAT;
654 #endif
655 	sr->done_io = 0;
656 	sr->nr_multishot_loops = 0;
657 	return 0;
658 }
659 
660 static inline void io_recv_prep_retry(struct io_kiocb *req)
661 {
662 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
663 
664 	sr->done_io = 0;
665 	sr->len = 0; /* get from the provided buffer */
666 	req->buf_index = sr->buf_group;
667 }
668 
/*
 * Finishes io_recv and io_recvmsg.
 *
 * @ret is in/out: on entry it holds the receive result, on return it holds
 * the value the issue handler should return (IOU_OK, IOU_STOP_MULTISHOT,
 * IOU_ISSUE_SKIP_COMPLETE, IOU_REQUEUE or -EAGAIN). @mshot_finished tells
 * whether the caller considers the multishot sequence over.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct msghdr *msg, bool mshot_finished,
				  unsigned issue_flags)
{
	unsigned int cflags;

	cflags = io_put_kbuf(req, issue_flags);
	/* -1 is the "unknown" marker the callers preset msg_inq to */
	if (msg->msg_inq && msg->msg_inq != -1)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	/* single-shot request: plain completion */
	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, *ret, cflags);
		*ret = IOU_OK;
		return true;
	}

	if (mshot_finished)
		goto finish;

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
				*ret, cflags | IORING_CQE_F_MORE)) {
		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;

		io_recv_prep_retry(req);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			mshot_retry_ret = IOU_REQUEUE;
		}
		if (issue_flags & IO_URING_F_MULTISHOT)
			*ret = mshot_retry_ret;
		else
			*ret = -EAGAIN;
		return true;
	}
	/* Otherwise stop multishot but use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);

	if (issue_flags & IO_URING_F_MULTISHOT)
		*ret = IOU_STOP_MULTISHOT;
	else
		*ret = IOU_OK;
	return true;
}
728 
729 static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
730 				     struct io_sr_msg *sr, void __user **buf,
731 				     size_t *len)
732 {
733 	unsigned long ubuf = (unsigned long) *buf;
734 	unsigned long hdr;
735 
736 	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
737 		kmsg->controllen;
738 	if (*len < hdr)
739 		return -EFAULT;
740 
741 	if (kmsg->controllen) {
742 		unsigned long control = ubuf + hdr - kmsg->controllen;
743 
744 		kmsg->msg.msg_control_user = (void __user *) control;
745 		kmsg->msg.msg_controllen = kmsg->controllen;
746 	}
747 
748 	sr->buf = *buf; /* stash for later copy */
749 	*buf = (void __user *) (ubuf + hdr);
750 	kmsg->payloadlen = *len = *len - hdr;
751 	return 0;
752 }
753 
/*
 * On-stack staging area used by io_recvmsg_multishot(): the result header
 * immediately followed by the source address, so that both can be copied to
 * the user buffer with one copy_to_user(). A BUILD_BUG_ON in that function
 * checks there is no padding between the two members.
 */
struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};
758 
/*
 * Perform one receive of a multishot recvmsg into a provided buffer that was
 * laid out by io_recvmsg_prep_multishot().
 *
 * After the receive, a struct io_uring_recvmsg_out plus the (possibly
 * truncated) source address is copied to the start of the user buffer.
 * *finished is set when the receive returned <= 0 or the header copy
 * faulted.
 *
 * Returns the number of buffer bytes consumed (result header + reserved name
 * area + reserved control area + payload stored), or a negative error.
 */
static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	/* receive the source address into the staging header */
	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	/* control bytes used = reserved size minus what the receive left over */
	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	/* report the length sock_recvmsg() returned, but account at most payloadlen */
	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	/* copy the header plus no more of the name than was reserved for it */
	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}
812 
/*
 * Issue handler for recvmsg requests, single-shot and multishot.
 *
 * Uses the async msghdr if one is attached to @req, otherwise copies the
 * header from userspace into an on-stack one. When the receive has to be
 * retried later the header is moved to async data via io_setup_async_msg().
 * Completion, including the decision to loop for another multishot receive,
 * is delegated to io_recv_finish().
 *
 * Returns the value produced by io_recv_finish(), the return value of
 * io_setup_async_msg() when a retry is set up, IOU_ISSUE_SKIP_COMPLETE for a
 * multishot issue that hit -EAGAIN, or a negative error for early failures
 * (-ENOTSOCK, header copy errors, -ENOBUFS, multishot buffer too small).
 */
int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	/* poll-first requested and not polled yet: stash the header, don't receive */
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	/* multishot is not allowed to run in this context, retry elsewhere */
	if (!io_check_multishot(req, issue_flags))
		return io_setup_async_msg(req, kmsg, issue_flags);

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			/* reserve the result header area at the front of the buffer */
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

	/* -1 marks msg_inq as "unknown" for io_recv_finish() */
	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			ret = io_setup_async_msg(req, kmsg, issue_flags);
			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
				/* give the unused buffer back and keep the request armed */
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return ret;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			/* short MSG_WAITALL receive: record progress and retry */
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/* MSG_WAITALL receive that came back truncated counts as failed */
		req_set_fail(req);
	}

	/* fold in earlier partial progress; recycle the buffer if nothing was received */
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
		goto retry_multishot;

	if (mshot_finished) {
		/* fast path, check for non-NULL to avoid function call */
		if (kmsg->free_iov)
			kfree(kmsg->free_iov);
		io_netmsg_recycle(req, issue_flags);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}

	return ret;
}
922 
/*
 * Issue handler for plain recv requests, single-shot and multishot.
 *
 * Receives into sr->buf (a user buffer or a selected provided buffer) using
 * an on-stack msghdr; no async data is needed, so retries are requested by
 * returning -EAGAIN. Completion, including the decision to loop for another
 * multishot receive, is delegated to io_recv_finish().
 *
 * Returns the value produced by io_recv_finish(), -EAGAIN when the request
 * must be retried, IOU_ISSUE_SKIP_COMPLETE for a multishot issue that hit
 * -EAGAIN, or a negative error for early failures (-ENOTSOCK, -ENOBUFS).
 */
int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;

	/* poll-first requested and not polled yet */
	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	/* multishot is not allowed to run in this context, retry elsewhere */
	if (!io_check_multishot(req, issue_flags))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = NULL;
	msg.msg_get_inq = 1;
	msg.msg_controllen = 0;
	msg.msg_iocb = NULL;
	msg.msg_ubuf = NULL;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = len;
	}

	ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
	if (unlikely(ret))
		goto out_free;

	/* -1 marks msg_inq as "unknown" for io_recv_finish() */
	msg.msg_inq = -1;
	msg.msg_flags = 0;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	/* with MSG_WAITALL anything below the full buffer length is short */
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	ret = sock_recvmsg(sock, &msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				/* give the unused buffer back and keep the request armed */
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}

			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			/* short MSG_WAITALL receive: advance past what arrived and retry */
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		/* also the target of the import_ubuf() failure path above */
out_free:
		req_set_fail(req);
	}

	/* fold in earlier partial progress; recycle the buffer if nothing was received */
	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	/* a result <= 0 ends a multishot sequence */
	if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
		goto retry_multishot;

	return ret;
}
1013 
1014 void io_send_zc_cleanup(struct io_kiocb *req)
1015 {
1016 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1017 	struct io_async_msghdr *io;
1018 
1019 	if (req_has_async_data(req)) {
1020 		io = req->async_data;
1021 		/* might be ->fast_iov if *msg_copy_hdr failed */
1022 		if (io->free_iov != io->fast_iov)
1023 			kfree(io->free_iov);
1024 	}
1025 	if (zc->notif) {
1026 		io_notif_flush(zc->notif);
1027 		zc->notif = NULL;
1028 	}
1029 }
1030 
/* ioprio flags that need no extra handling in io_send_zc_prep() */
#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
/* every ioprio flag io_send_zc_prep() accepts */
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

/*
 * Decode the SQE of a zerocopy send request into the io_sr_msg command area.
 *
 * Allocates the notification request that accompanies the send and copies
 * the user_data into it. Because REQ_F_NEED_CLEANUP is set right after the
 * allocation, the error returns below rely on the cleanup handler to flush
 * the notification.
 *
 * Returns 0 on success, -EINVAL on a malformed SQE or unsupported flag
 * combination, -ENOMEM if no notification could be allocated, or -EFAULT for
 * an out-of-range fixed buffer index.
 */
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->flags = READ_ONCE(sqe->ioprio);
	/* slow path only when something beyond the common flags is set */
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			io_notif_set_extended(notif);
			io_notif_to_data(notif)->zc_report = true;
		}
	}

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		unsigned idx = READ_ONCE(sqe->buf_index);

		if (unlikely(idx >= ctx->nr_user_bufs))
			return -EFAULT;
		/* clamp the index under speculation before using it */
		idx = array_index_nospec(idx, ctx->nr_user_bufs);
		req->imu = READ_ONCE(ctx->user_bufs[idx]);
		io_req_set_rsrc_node(notif, ctx, 0);
	}

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		zc->addr_len = READ_ONCE(sqe->addr_len);
	} else {
		/* the other opcode handled here takes no address and no fixed buffer */
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	zc->done_io = 0;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return 0;
}
1100 
1101 static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
1102 				 struct iov_iter *from, size_t length)
1103 {
1104 	skb_zcopy_downgrade_managed(skb);
1105 	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1106 }
1107 
1108 static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
1109 			   struct iov_iter *from, size_t length)
1110 {
1111 	struct skb_shared_info *shinfo = skb_shinfo(skb);
1112 	int frag = shinfo->nr_frags;
1113 	int ret = 0;
1114 	struct bvec_iter bi;
1115 	ssize_t copied = 0;
1116 	unsigned long truesize = 0;
1117 
1118 	if (!frag)
1119 		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
1120 	else if (unlikely(!skb_zcopy_managed(skb)))
1121 		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
1122 
1123 	bi.bi_size = min(from->count, length);
1124 	bi.bi_bvec_done = from->iov_offset;
1125 	bi.bi_idx = 0;
1126 
1127 	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
1128 		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
1129 
1130 		copied += v.bv_len;
1131 		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
1132 		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
1133 					   v.bv_offset, v.bv_len);
1134 		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
1135 	}
1136 	if (bi.bi_size)
1137 		ret = -EMSGSIZE;
1138 
1139 	shinfo->nr_frags = frag;
1140 	from->bvec += bi.bi_idx;
1141 	from->nr_segs -= bi.bi_idx;
1142 	from->count -= copied;
1143 	from->iov_offset = bi.bi_bvec_done;
1144 
1145 	skb->data_len += copied;
1146 	skb->len += copied;
1147 	skb->truesize += truesize;
1148 
1149 	if (sk && sk->sk_type == SOCK_STREAM) {
1150 		sk_wmem_queued_add(sk, truesize);
1151 		if (!skb_zcopy_pure(skb))
1152 			sk_mem_charge(sk, truesize);
1153 	} else {
1154 		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
1155 	}
1156 	return ret;
1157 }
1158 
1159 int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
1160 {
1161 	struct sockaddr_storage __address;
1162 	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
1163 	struct msghdr msg;
1164 	struct socket *sock;
1165 	unsigned msg_flags;
1166 	int ret, min_ret = 0;
1167 
1168 	sock = sock_from_file(req->file);
1169 	if (unlikely(!sock))
1170 		return -ENOTSOCK;
1171 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1172 		return -EOPNOTSUPP;
1173 
1174 	msg.msg_name = NULL;
1175 	msg.msg_control = NULL;
1176 	msg.msg_controllen = 0;
1177 	msg.msg_namelen = 0;
1178 
1179 	if (zc->addr) {
1180 		if (req_has_async_data(req)) {
1181 			struct io_async_msghdr *io = req->async_data;
1182 
1183 			msg.msg_name = &io->addr;
1184 		} else {
1185 			ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
1186 			if (unlikely(ret < 0))
1187 				return ret;
1188 			msg.msg_name = (struct sockaddr *)&__address;
1189 		}
1190 		msg.msg_namelen = zc->addr_len;
1191 	}
1192 
1193 	if (!(req->flags & REQ_F_POLLED) &&
1194 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
1195 		return io_setup_async_addr(req, &__address, issue_flags);
1196 
1197 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
1198 		ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
1199 					(u64)(uintptr_t)zc->buf, zc->len);
1200 		if (unlikely(ret))
1201 			return ret;
1202 		msg.sg_from_iter = io_sg_from_iter;
1203 	} else {
1204 		io_notif_set_extended(zc->notif);
1205 		ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
1206 		if (unlikely(ret))
1207 			return ret;
1208 		ret = io_notif_account_mem(zc->notif, zc->len);
1209 		if (unlikely(ret))
1210 			return ret;
1211 		msg.sg_from_iter = io_sg_from_iter_iovec;
1212 	}
1213 
1214 	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
1215 	if (issue_flags & IO_URING_F_NONBLOCK)
1216 		msg_flags |= MSG_DONTWAIT;
1217 	if (msg_flags & MSG_WAITALL)
1218 		min_ret = iov_iter_count(&msg.msg_iter);
1219 	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
1220 
1221 	msg.msg_flags = msg_flags;
1222 	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
1223 	ret = sock_sendmsg(sock, &msg);
1224 
1225 	if (unlikely(ret < min_ret)) {
1226 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1227 			return io_setup_async_addr(req, &__address, issue_flags);
1228 
1229 		if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
1230 			zc->len -= ret;
1231 			zc->buf += ret;
1232 			zc->done_io += ret;
1233 			req->flags |= REQ_F_PARTIAL_IO;
1234 			return io_setup_async_addr(req, &__address, issue_flags);
1235 		}
1236 		if (ret == -ERESTARTSYS)
1237 			ret = -EINTR;
1238 		req_set_fail(req);
1239 	}
1240 
1241 	if (ret >= 0)
1242 		ret += zc->done_io;
1243 	else if (zc->done_io)
1244 		ret = zc->done_io;
1245 
1246 	/*
1247 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1248 	 * flushing notif to io_send_zc_cleanup()
1249 	 */
1250 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1251 		io_notif_flush(zc->notif);
1252 		req->flags &= ~REQ_F_NEED_CLEANUP;
1253 	}
1254 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1255 	return IOU_OK;
1256 }
1257 
1258 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
1259 {
1260 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1261 	struct io_async_msghdr iomsg, *kmsg;
1262 	struct socket *sock;
1263 	unsigned flags;
1264 	int ret, min_ret = 0;
1265 
1266 	io_notif_set_extended(sr->notif);
1267 
1268 	sock = sock_from_file(req->file);
1269 	if (unlikely(!sock))
1270 		return -ENOTSOCK;
1271 	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
1272 		return -EOPNOTSUPP;
1273 
1274 	if (req_has_async_data(req)) {
1275 		kmsg = req->async_data;
1276 	} else {
1277 		ret = io_sendmsg_copy_hdr(req, &iomsg);
1278 		if (ret)
1279 			return ret;
1280 		kmsg = &iomsg;
1281 	}
1282 
1283 	if (!(req->flags & REQ_F_POLLED) &&
1284 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
1285 		return io_setup_async_msg(req, kmsg, issue_flags);
1286 
1287 	flags = sr->msg_flags | MSG_ZEROCOPY;
1288 	if (issue_flags & IO_URING_F_NONBLOCK)
1289 		flags |= MSG_DONTWAIT;
1290 	if (flags & MSG_WAITALL)
1291 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
1292 
1293 	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
1294 	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
1295 	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
1296 
1297 	if (unlikely(ret < min_ret)) {
1298 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1299 			return io_setup_async_msg(req, kmsg, issue_flags);
1300 
1301 		if (ret > 0 && io_net_retry(sock, flags)) {
1302 			sr->done_io += ret;
1303 			req->flags |= REQ_F_PARTIAL_IO;
1304 			return io_setup_async_msg(req, kmsg, issue_flags);
1305 		}
1306 		if (ret == -ERESTARTSYS)
1307 			ret = -EINTR;
1308 		req_set_fail(req);
1309 	}
1310 	/* fast path, check for non-NULL to avoid function call */
1311 	if (kmsg->free_iov) {
1312 		kfree(kmsg->free_iov);
1313 		kmsg->free_iov = NULL;
1314 	}
1315 
1316 	io_netmsg_recycle(req, issue_flags);
1317 	if (ret >= 0)
1318 		ret += sr->done_io;
1319 	else if (sr->done_io)
1320 		ret = sr->done_io;
1321 
1322 	/*
1323 	 * If we're in io-wq we can't rely on tw ordering guarantees, defer
1324 	 * flushing notif to io_send_zc_cleanup()
1325 	 */
1326 	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
1327 		io_notif_flush(sr->notif);
1328 		req->flags &= ~REQ_F_NEED_CLEANUP;
1329 	}
1330 	io_req_set_res(req, ret, IORING_CQE_F_MORE);
1331 	return IOU_OK;
1332 }
1333 
1334 void io_sendrecv_fail(struct io_kiocb *req)
1335 {
1336 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
1337 
1338 	if (req->flags & REQ_F_PARTIAL_IO)
1339 		req->cqe.res = sr->done_io;
1340 
1341 	if ((req->flags & REQ_F_NEED_CLEANUP) &&
1342 	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
1343 		req->cqe.flags |= IORING_CQE_F_MORE;
1344 }
1345 
1346 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1347 {
1348 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1349 	unsigned flags;
1350 
1351 	if (sqe->len || sqe->buf_index)
1352 		return -EINVAL;
1353 
1354 	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1355 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
1356 	accept->flags = READ_ONCE(sqe->accept_flags);
1357 	accept->nofile = rlimit(RLIMIT_NOFILE);
1358 	flags = READ_ONCE(sqe->ioprio);
1359 	if (flags & ~IORING_ACCEPT_MULTISHOT)
1360 		return -EINVAL;
1361 
1362 	accept->file_slot = READ_ONCE(sqe->file_index);
1363 	if (accept->file_slot) {
1364 		if (accept->flags & SOCK_CLOEXEC)
1365 			return -EINVAL;
1366 		if (flags & IORING_ACCEPT_MULTISHOT &&
1367 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
1368 			return -EINVAL;
1369 	}
1370 	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1371 		return -EINVAL;
1372 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
1373 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
1374 	if (flags & IORING_ACCEPT_MULTISHOT)
1375 		req->flags |= REQ_F_APOLL_MULTISHOT;
1376 	return 0;
1377 }
1378 
1379 int io_accept(struct io_kiocb *req, unsigned int issue_flags)
1380 {
1381 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
1382 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1383 	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
1384 	bool fixed = !!accept->file_slot;
1385 	struct file *file;
1386 	int ret, fd;
1387 
1388 	if (!io_check_multishot(req, issue_flags))
1389 		return -EAGAIN;
1390 retry:
1391 	if (!fixed) {
1392 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
1393 		if (unlikely(fd < 0))
1394 			return fd;
1395 	}
1396 	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
1397 			 accept->flags);
1398 	if (IS_ERR(file)) {
1399 		if (!fixed)
1400 			put_unused_fd(fd);
1401 		ret = PTR_ERR(file);
1402 		if (ret == -EAGAIN && force_nonblock) {
1403 			/*
1404 			 * if it's multishot and polled, we don't need to
1405 			 * return EAGAIN to arm the poll infra since it
1406 			 * has already been done
1407 			 */
1408 			if (issue_flags & IO_URING_F_MULTISHOT)
1409 				return IOU_ISSUE_SKIP_COMPLETE;
1410 			return ret;
1411 		}
1412 		if (ret == -ERESTARTSYS)
1413 			ret = -EINTR;
1414 		req_set_fail(req);
1415 	} else if (!fixed) {
1416 		fd_install(fd, file);
1417 		ret = fd;
1418 	} else {
1419 		ret = io_fixed_fd_install(req, issue_flags, file,
1420 						accept->file_slot);
1421 	}
1422 
1423 	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
1424 		io_req_set_res(req, ret, 0);
1425 		return IOU_OK;
1426 	}
1427 
1428 	if (ret < 0)
1429 		return ret;
1430 	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
1431 				ret, IORING_CQE_F_MORE))
1432 		goto retry;
1433 
1434 	io_req_set_res(req, ret, 0);
1435 	return IOU_STOP_MULTISHOT;
1436 }
1437 
1438 int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1439 {
1440 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1441 
1442 	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
1443 		return -EINVAL;
1444 
1445 	sock->domain = READ_ONCE(sqe->fd);
1446 	sock->type = READ_ONCE(sqe->off);
1447 	sock->protocol = READ_ONCE(sqe->len);
1448 	sock->file_slot = READ_ONCE(sqe->file_index);
1449 	sock->nofile = rlimit(RLIMIT_NOFILE);
1450 
1451 	sock->flags = sock->type & ~SOCK_TYPE_MASK;
1452 	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
1453 		return -EINVAL;
1454 	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
1455 		return -EINVAL;
1456 	return 0;
1457 }
1458 
1459 int io_socket(struct io_kiocb *req, unsigned int issue_flags)
1460 {
1461 	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
1462 	bool fixed = !!sock->file_slot;
1463 	struct file *file;
1464 	int ret, fd;
1465 
1466 	if (!fixed) {
1467 		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
1468 		if (unlikely(fd < 0))
1469 			return fd;
1470 	}
1471 	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
1472 	if (IS_ERR(file)) {
1473 		if (!fixed)
1474 			put_unused_fd(fd);
1475 		ret = PTR_ERR(file);
1476 		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
1477 			return -EAGAIN;
1478 		if (ret == -ERESTARTSYS)
1479 			ret = -EINTR;
1480 		req_set_fail(req);
1481 	} else if (!fixed) {
1482 		fd_install(fd, file);
1483 		ret = fd;
1484 	} else {
1485 		ret = io_fixed_fd_install(req, issue_flags, file,
1486 					    sock->file_slot);
1487 	}
1488 	io_req_set_res(req, ret, 0);
1489 	return IOU_OK;
1490 }
1491 
1492 int io_connect_prep_async(struct io_kiocb *req)
1493 {
1494 	struct io_async_connect *io = req->async_data;
1495 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1496 
1497 	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
1498 }
1499 
1500 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
1501 {
1502 	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);
1503 
1504 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
1505 		return -EINVAL;
1506 
1507 	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
1508 	conn->addr_len =  READ_ONCE(sqe->addr2);
1509 	conn->in_progress = conn->seen_econnaborted = false;
1510 	return 0;
1511 }
1512 
1513 int io_connect(struct io_kiocb *req, unsigned int issue_flags)
1514 {
1515 	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
1516 	struct io_async_connect __io, *io;
1517 	unsigned file_flags;
1518 	int ret;
1519 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
1520 
1521 	if (req_has_async_data(req)) {
1522 		io = req->async_data;
1523 	} else {
1524 		ret = move_addr_to_kernel(connect->addr,
1525 						connect->addr_len,
1526 						&__io.address);
1527 		if (ret)
1528 			goto out;
1529 		io = &__io;
1530 	}
1531 
1532 	file_flags = force_nonblock ? O_NONBLOCK : 0;
1533 
1534 	ret = __sys_connect_file(req->file, &io->address,
1535 					connect->addr_len, file_flags);
1536 	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
1537 	    && force_nonblock) {
1538 		if (ret == -EINPROGRESS) {
1539 			connect->in_progress = true;
1540 		} else if (ret == -ECONNABORTED) {
1541 			if (connect->seen_econnaborted)
1542 				goto out;
1543 			connect->seen_econnaborted = true;
1544 		}
1545 		if (req_has_async_data(req))
1546 			return -EAGAIN;
1547 		if (io_alloc_async_data(req)) {
1548 			ret = -ENOMEM;
1549 			goto out;
1550 		}
1551 		memcpy(req->async_data, &__io, sizeof(__io));
1552 		return -EAGAIN;
1553 	}
1554 	if (connect->in_progress) {
1555 		/*
1556 		 * At least bluetooth will return -EBADFD on a re-connect
1557 		 * attempt, and it's (supposedly) also valid to get -EISCONN
1558 		 * which means the previous result is good. For both of these,
1559 		 * grab the sock_error() and use that for the completion.
1560 		 */
1561 		if (ret == -EBADFD || ret == -EISCONN)
1562 			ret = sock_error(sock_from_file(req->file)->sk);
1563 	}
1564 	if (ret == -ERESTARTSYS)
1565 		ret = -EINTR;
1566 out:
1567 	if (ret < 0)
1568 		req_set_fail(req);
1569 	io_req_set_res(req, ret, 0);
1570 	return IOU_OK;
1571 }
1572 
1573 void io_netmsg_cache_free(struct io_cache_entry *entry)
1574 {
1575 	kfree(container_of(entry, struct io_async_msghdr, cache));
1576 }
1577 #endif
1578