xref: /openbmc/linux/io_uring/net.c (revision bc9a2b3e)
// SPDX-License-Identifier: GPL-2.0
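/*
 * Networking opcode support for io_uring: shutdown, accept, socket,
 * connect, and the send/receive families (plain, msghdr and zero-copy
 * variants), including multishot receive and accept.
 */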
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"

#if defined(CONFIG_NET)
struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_socket {
	struct file			*file;
	int				domain;
	int				type;
	int				protocol;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
	bool				in_progress;
	bool				seen_econnaborted;
};

struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	unsigned			len;
	unsigned			done_io;
	unsigned			msg_flags;
	u16				flags;
	/* initialised and used only by !msg send variants */
	u16				addr_len;
	u16				buf_group;
	void __user			*addr;
	void __user			*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb			*notif;
};

static inline bool io_check_multishot(struct io_kiocb *req,
				      unsigned int issue_flags)
{
	/*
	 * When ->task_complete is set we only allow posting CQEs from the
	 * original task context. Usual request completions will be handled
	 * in other generic paths, but multishot poll may decide to post
	 * extra CQEs.
	 */
	return !(issue_flags & IO_URING_F_IOWQ) ||
		!(issue_flags & IO_URING_F_MULTISHOT) ||
		!req->ctx->task_complete;
}

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

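/*
 * A short transfer is only worth retrying when MSG_WAITALL was requested
 * and the socket type preserves stream/record position across calls
 * (SOCK_STREAM or SOCK_SEQPACKET).
 */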
static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

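/*
 * Return the request's async msghdr to the per-ring netmsg_cache rather
 * than freeing it, so the next network request can reuse the allocation.
 * Only done when the ring is locked (i.e. not IO_URING_F_UNLOCKED).
 */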
static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
		return;

	/* Let normal cleanup path reap it if we fail adding to the cache */
	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
		req->async_data = NULL;
		req->flags &= ~REQ_F_ASYNC_DATA;
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
						  unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cache_entry *entry;
	struct io_async_msghdr *hdr;

	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		entry = io_alloc_cache_get(&ctx->netmsg_cache);
		if (entry) {
			hdr = container_of(entry, struct io_async_msghdr, cache);
			hdr->free_iov = NULL;
			req->flags |= REQ_F_ASYNC_DATA;
			req->async_data = hdr;
			return hdr;
		}
	}

	if (!io_alloc_async_data(req)) {
		hdr = req->async_data;
		hdr->free_iov = NULL;
		return hdr;
	}
	return NULL;
}

static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
{
	/* ->prep_async is always called from the submission context */
	return io_msg_alloc_async(req, 0);
}

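/*
 * Copy the on-stack msghdr state into persistently allocated async data so
 * the request can be retried later; returns -EAGAIN (or -ENOMEM) so the
 * caller arms poll or punts the request to io-wq.
 */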
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg,
			      unsigned int issue_flags)
{
	struct io_async_msghdr *async_msg;

	if (req_has_async_data(req))
		return -EAGAIN;
	async_msg = io_msg_alloc_async(req, issue_flags);
	if (!async_msg) {
		kfree(kmsg->free_iov);
		return -ENOMEM;
	}
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	if (async_msg->msg.msg_name)
		async_msg->msg.msg_name = &async_msg->addr;
	/* if we were using fast_iov, set it to the new one */
	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
	}

	return -EAGAIN;
}

static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->free_iov = iomsg->fast_iov;
	ret = sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags,
					&iomsg->free_iov);
	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = iomsg->msg.msg_control_user;
	return ret;
}

int io_send_prep_async(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;
	int ret;

	if (!zc->addr || req_has_async_data(req))
		return 0;
	io = io_msg_alloc_async_prep(req);
	if (!io)
		return -ENOMEM;
	ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
	return ret;
}

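/*
 * Like io_setup_async_msg(), but for the addressed non-msghdr send
 * variants: only the destination address needs to survive across a retry.
 */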
static int io_setup_async_addr(struct io_kiocb *req,
			      struct sockaddr_storage *addr_storage,
			      unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;

	if (!sr->addr || req_has_async_data(req))
		return -EAGAIN;
	io = io_msg_alloc_async(req, issue_flags);
	if (!io)
		return -ENOMEM;
	memcpy(&io->addr, addr_storage, sizeof(io->addr));
	return -EAGAIN;
}

int io_sendmsg_prep_async(struct io_kiocb *req)
{
	int ret;

	if (!io_msg_alloc_async_prep(req))
		return -ENOMEM;
	ret = io_sendmsg_copy_hdr(req, req->async_data);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	kfree(io->free_iov);
}

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (req->opcode == IORING_OP_SEND) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		sr->addr_len = READ_ONCE(sqe->addr_len);
	} else if (sqe->addr2 || sqe->file_index) {
		return -EINVAL;
	}

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->done_io = 0;
	return 0;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
		kmsg->msg.msg_control_user = sr->msg_control;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_netmsg_recycle(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address;
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_ubuf = NULL;

	if (sr->addr) {
		if (req_has_async_data(req)) {
			struct io_async_msghdr *io = req->async_data;

			msg.msg_name = &io->addr;
		} else {
			ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
			if (unlikely(ret < 0))
				return ret;
			msg.msg_name = (struct sockaddr *)&__address;
		}
		msg.msg_namelen = sr->addr_len;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_addr(req, &__address, issue_flags);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_addr(req, &__address, issue_flags);

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, &__address, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
{
	int hdr;

	if (iomsg->namelen < 0)
		return true;
	if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
			       iomsg->namelen, &hdr))
		return true;
	if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
		return true;

	return false;
}

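/*
 * Copy the user-supplied msghdr for a receive. With REQ_F_BUFFER_SELECT
 * the data buffer comes from a provided buffer at issue time, so at most
 * one (possibly empty) iovec may be passed in and only its length is
 * recorded here.
 */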
static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
				 struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr msg;
	int ret;

	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
		return -EFAULT;

	ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg.msg_iovlen == 0) {
			sr->len = iomsg->fast_iov[0].iov_len = 0;
			iomsg->fast_iov[0].iov_base = NULL;
			iomsg->free_iov = NULL;
		} else if (msg.msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
				return -EFAULT;
			sr->len = iomsg->fast_iov[0].iov_len;
			iomsg->free_iov = NULL;
		}

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			iomsg->namelen = msg.msg_namelen;
			iomsg->controllen = msg.msg_controllen;
			if (io_recvmsg_multishot_overflow(iomsg))
				return -EOVERFLOW;
		}
	} else {
		iomsg->free_iov = iomsg->fast_iov;
		ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
				     &iomsg->free_iov, &iomsg->msg.msg_iter,
				     false);
		if (ret > 0)
			ret = 0;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
					struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_msghdr msg;
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg.msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		iomsg->free_iov = NULL;
		if (msg.msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg.msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (!access_ok(uiov, sizeof(*uiov)))
				return -EFAULT;
			if (__get_user(clen, &uiov->iov_len))
				return -EFAULT;
			if (clen < 0)
				return -EINVAL;
			sr->len = clen;
		}

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			iomsg->namelen = msg.msg_namelen;
			iomsg->controllen = msg.msg_controllen;
			if (io_recvmsg_multishot_overflow(iomsg))
				return -EOVERFLOW;
		}
	} else {
		iomsg->free_iov = iomsg->fast_iov;
		ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen,
				   UIO_FASTIOV, &iomsg->free_iov,
				   &iomsg->msg.msg_iter, true);
		if (ret < 0)
			return ret;
	}

	return 0;
}
#endif

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	iomsg->msg.msg_name = &iomsg->addr;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return __io_compat_recvmsg_copy_hdr(req, iomsg);
#endif

	return __io_recvmsg_copy_hdr(req, iomsg);
}

int io_recvmsg_prep_async(struct io_kiocb *req)
{
	int ret;

	if (!io_msg_alloc_async_prep(req))
		return -ENOMEM;
	ret = io_recvmsg_copy_hdr(req, req->async_data);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)

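/*
 * Illustrative userspace sketch (not part of this file), assuming
 * liburing >= 2.3 with a registered provided-buffer group BGID: multishot
 * receive requires provided buffers and a zero length, as enforced below.
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = BGID;
 *	io_uring_submit(&ring);
 */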
int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~(RECVMSG_FLAGS))
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
	}

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->done_io = 0;
	return 0;
}

static inline void io_recv_prep_retry(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  unsigned int cflags, bool mshot_finished,
				  unsigned issue_flags)
{
	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, *ret, cflags);
		*ret = IOU_OK;
		return true;
	}

	if (!mshot_finished) {
		if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
			       req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) {
			io_recv_prep_retry(req);
			return false;
		}
		/* Otherwise stop multishot but use the current result. */
	}

	io_req_set_res(req, *ret, cflags);

	if (issue_flags & IO_URING_F_MULTISHOT)
		*ret = IOU_STOP_MULTISHOT;
	else
		*ret = IOU_OK;
	return true;
}

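/*
 * Carve up the selected provided buffer for multishot recvmsg: a struct
 * io_uring_recvmsg_out header (plus name and control data) is written at
 * the front, and the payload lands in whatever space remains.
 */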
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

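/*
 * One multishot recvmsg iteration: receive directly into the payload area
 * of the selected buffer, then copy the io_uring_recvmsg_out header and
 * any source address in front of it. Returns the total number of bytes
 * the buffer now holds (header + name + control + payload), or an error.
 */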
static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned int cflags;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	if (!io_check_multishot(req, issue_flags))
		return io_setup_async_msg(req, kmsg, issue_flags);

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

	kmsg->msg.msg_get_inq = 1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			ret = io_setup_async_msg(req, kmsg, issue_flags);
			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return ret;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	cflags = io_put_kbuf(req, issue_flags);
	if (kmsg->msg.msg_inq)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (!io_recv_finish(req, &ret, cflags, mshot_finished, issue_flags))
		goto retry_multishot;

	if (mshot_finished) {
		/* fast path, check for non-NULL to avoid function call */
		if (kmsg->free_iov)
			kfree(kmsg->free_iov);
		io_netmsg_recycle(req, issue_flags);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}

	return ret;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned int cflags;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (!io_check_multishot(req, issue_flags))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
	}

	ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
	if (unlikely(ret))
		goto out_free;

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = NULL;
	msg.msg_get_inq = 1;
	msg.msg_flags = 0;
	msg.msg_controllen = 0;
	msg.msg_iocb = NULL;
	msg.msg_ubuf = NULL;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	ret = sock_recvmsg(sock, &msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}

			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	cflags = io_put_kbuf(req, issue_flags);
	if (msg.msg_inq)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (!io_recv_finish(req, &ret, cflags, ret <= 0, issue_flags))
		goto retry_multishot;

	return ret;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;

	if (req_has_async_data(req)) {
		io = req->async_data;
		/* might be ->fast_iov if *msg_copy_hdr failed */
		if (io->free_iov != io->fast_iov)
			kfree(io->free_iov);
	}
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

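/*
 * Illustrative userspace sketch (not part of this file), assuming
 * liburing >= 2.3: a zero-copy send posts two CQEs per request, the send
 * result (with IORING_CQE_F_MORE set) and a later IORING_CQE_F_NOTIF
 * completion once the kernel no longer needs the pages.
 *
 *	struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 *
 *	io_uring_prep_send_zc(sqe, sockfd, data, data_len, 0, 0);
 *	io_uring_submit(&ring);
 */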
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			io_notif_set_extended(notif);
			io_notif_to_data(notif)->zc_report = true;
		}
	}

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		unsigned idx = READ_ONCE(sqe->buf_index);

		if (unlikely(idx >= ctx->nr_user_bufs))
			return -EFAULT;
		idx = array_index_nospec(idx, ctx->nr_user_bufs);
		req->imu = READ_ONCE(ctx->user_bufs[idx]);
		io_req_set_rsrc_node(notif, ctx, 0);
	}

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		zc->addr_len = READ_ONCE(sqe->addr_len);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	zc->done_io = 0;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return 0;
}

static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
}

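/*
 * Fill skb frags straight from the bvec iterator without copying and
 * without taking per-page references (SKBFL_MANAGED_FRAG_REFS); the
 * registered buffer keeps the pages alive. Falls back to
 * __zerocopy_sg_from_iter() if the skb already carries unmanaged frags.
 */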
static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;

	if (sk && sk->sk_type == SOCK_STREAM) {
		sk_wmem_queued_add(sk, truesize);
		if (!skb_zcopy_pure(skb))
			sk_mem_charge(sk, truesize);
	} else {
		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
	}
	return ret;
}

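/*
 * Issue IORING_OP_SEND_ZC. The payload pages are handed to the network
 * stack by reference via msg_ubuf/sg_from_iter, so completion is split in
 * two: the send result is posted here with IORING_CQE_F_MORE, and the
 * notif CQE follows once the stack drops its references to the pages.
 */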
int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address;
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;

	if (zc->addr) {
		if (req_has_async_data(req)) {
			struct io_async_msghdr *io = req->async_data;

			msg.msg_name = &io->addr;
		} else {
			ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
			if (unlikely(ret < 0))
				return ret;
			msg.msg_name = (struct sockaddr *)&__address;
		}
		msg.msg_namelen = zc->addr_len;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_addr(req, &__address, issue_flags);

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
					(u64)(uintptr_t)zc->buf, zc->len);
		if (unlikely(ret))
			return ret;
		msg.sg_from_iter = io_sg_from_iter;
	} else {
		io_notif_set_extended(zc->notif);
		ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
		if (unlikely(ret))
			return ret;
		ret = io_notif_account_mem(zc->notif, zc->len);
		if (unlikely(ret))
			return ret;
		msg.sg_from_iter = io_sg_from_iter_iovec;
	}

	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	msg.msg_flags = msg_flags;
	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_addr(req, &__address, issue_flags);

		if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, &__address, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, so
	 * defer flushing the notif to io_send_zc_cleanup().
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	io_notif_set_extended(sr->notif);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags | MSG_ZEROCOPY;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov) {
		kfree(kmsg->free_iov);
		kmsg->free_iov = NULL;
	}

	io_netmsg_recycle(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, so
	 * defer flushing the notif to io_send_zc_cleanup().
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (req->flags & REQ_F_PARTIAL_IO)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	unsigned flags;

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	flags = READ_ONCE(sqe->ioprio);
	if (flags & ~IORING_ACCEPT_MULTISHOT)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	return 0;
}

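/*
 * Accept one connection, or keep accepting in multishot mode: each
 * accepted socket is posted as an aux CQE with IORING_CQE_F_MORE and the
 * request loops back to retry until it sees -EAGAIN, or until an aux CQE
 * can no longer be posted.
 */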
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
	bool fixed = !!accept->file_slot;
	struct file *file;
	int ret, fd;

	if (!io_check_multishot(req, issue_flags))
		return -EAGAIN;
retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock) {
			/*
			 * if it's multishot and polled, we don't need to
			 * return EAGAIN to arm the poll infra since it
			 * has already been done
			 */
			if (issue_flags & IO_URING_F_MULTISHOT)
				ret = IOU_ISSUE_SKIP_COMPLETE;
			return ret;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
						accept->file_slot);
	}

	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, ret, 0);
		return IOU_OK;
	}

	if (ret < 0)
		return ret;
	if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
		       req->cqe.user_data, ret, IORING_CQE_F_MORE, true))
		goto retry;

	return -ECANCELED;
}

int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					    sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_connect_prep_async(struct io_kiocb *req)
{
	struct io_async_connect *io = req->async_data;
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;
	return 0;
}

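/*
 * Nonblocking connect: -EINPROGRESS marks the request in_progress so the
 * re-issue after poll only has to read sock_error() for the final result
 * instead of calling connect() again.
 */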
int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_connect __io, *io;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (connect->in_progress) {
		struct socket *socket;

		ret = -ENOTSOCK;
		socket = sock_from_file(req->file);
		if (socket)
			ret = sock_error(socket->sk);
		goto out;
	}

	if (req_has_async_data(req)) {
		io = req->async_data;
	} else {
		ret = move_addr_to_kernel(connect->addr,
						connect->addr_len,
						&__io.address);
		if (ret)
			goto out;
		io = &__io;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->address,
					connect->addr_len, file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
			return -EAGAIN;
		}
		if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		if (req_has_async_data(req))
			return -EAGAIN;
		if (io_alloc_async_data(req)) {
			ret = -ENOMEM;
			goto out;
		}
		memcpy(req->async_data, &__io, sizeof(__io));
		return -EAGAIN;
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_netmsg_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct io_async_msghdr, cache));
}
#endif