// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/net.h>
#include <linux/compat.h>
#include <net/compat.h>
#include <linux/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io_uring.h"
#include "kbuf.h"
#include "alloc_cache.h"
#include "net.h"
#include "notif.h"
#include "rsrc.h"

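/*
 * io_uring opcodes for the networking syscalls: shutdown, accept,
 * socket, connect, send(msg) and recv(msg), including the multishot
 * and zero-copy send variants. Everything below is conditional on
 * CONFIG_NET.
 */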
#if defined(CONFIG_NET)
struct io_shutdown {
	struct file			*file;
	int				how;
};

struct io_accept {
	struct file			*file;
	struct sockaddr __user		*addr;
	int __user			*addr_len;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_socket {
	struct file			*file;
	int				domain;
	int				type;
	int				protocol;
	int				flags;
	u32				file_slot;
	unsigned long			nofile;
};

struct io_connect {
	struct file			*file;
	struct sockaddr __user		*addr;
	int				addr_len;
	bool				in_progress;
	bool				seen_econnaborted;
};

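/*
 * Shared command data for send, recv and their *msg/zero-copy variants;
 * the union mirrors the different user pointers an SQE may carry.
 */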
struct io_sr_msg {
	struct file			*file;
	union {
		struct compat_msghdr __user	*umsg_compat;
		struct user_msghdr __user	*umsg;
		void __user			*buf;
	};
	unsigned			len;
	unsigned			done_io;
	unsigned			msg_flags;
	unsigned			nr_multishot_loops;
	u16				flags;
	/* initialised and used only by !msg send variants */
	u16				addr_len;
	u16				buf_group;
	void __user			*addr;
	void __user			*msg_control;
	/* used only for send zerocopy */
	struct io_kiocb			*notif;
};

/*
 * Number of times we'll try to do receives if there's more data. If we
 * exceed this limit, then add us to the back of the queue and retry from
 * there. This helps fairness between flooding clients.
 */
#define MULTISHOT_MAX_RETRY	32

static inline bool io_check_multishot(struct io_kiocb *req,
				      unsigned int issue_flags)
{
	/*
	 * When ->task_complete is set we only allow posting CQEs from the
	 * original task context. Usual request completions will be handled
	 * in other generic paths, but multishot poll may decide to post
	 * extra CQEs.
	 */
	return !(issue_flags & IO_URING_F_IOWQ) ||
		!(issue_flags & IO_URING_F_MULTISHOT) ||
		!req->ctx->task_complete;
}

int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);

	if (unlikely(sqe->off || sqe->addr || sqe->rw_flags ||
		     sqe->buf_index || sqe->splice_fd_in))
		return -EINVAL;

	shutdown->how = READ_ONCE(sqe->len);
	req->flags |= REQ_F_FORCE_ASYNC;
	return 0;
}

int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
	struct socket *sock;
	int ret;

	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = __sys_shutdown_sock(sock, shutdown->how);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

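/*
 * Short transfers are only worth retrying for MSG_WAITALL, and only on
 * connection-style sockets; datagrams either complete or don't.
 */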
static bool io_net_retry(struct socket *sock, int flags)
{
	if (!(flags & MSG_WAITALL))
		return false;
	return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
}

static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_async_msghdr *hdr = req->async_data;

	if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED)
		return;

	/* Let normal cleanup path reap it if we fail adding to the cache */
	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
		req->async_data = NULL;
		req->flags &= ~REQ_F_ASYNC_DATA;
	}
}

static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req,
						  unsigned int issue_flags)
{
	struct io_ring_ctx *ctx = req->ctx;
	struct io_cache_entry *entry;
	struct io_async_msghdr *hdr;

	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		entry = io_alloc_cache_get(&ctx->netmsg_cache);
		if (entry) {
			hdr = container_of(entry, struct io_async_msghdr, cache);
			hdr->free_iov = NULL;
			req->flags |= REQ_F_ASYNC_DATA;
			req->async_data = hdr;
			return hdr;
		}
	}

	if (!io_alloc_async_data(req)) {
		hdr = req->async_data;
		hdr->free_iov = NULL;
		return hdr;
	}
	return NULL;
}

static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req)
{
	/* ->prep_async is always called from the submission context */
	return io_msg_alloc_async(req, 0);
}

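/*
 * Snapshot the on-stack msghdr into persistently allocated async data
 * so the request can be retried later; returns -EAGAIN (or -ENOMEM) so
 * the caller bounces the request to task work / io-wq.
 */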
static int io_setup_async_msg(struct io_kiocb *req,
			      struct io_async_msghdr *kmsg,
			      unsigned int issue_flags)
{
	struct io_async_msghdr *async_msg;

	if (req_has_async_data(req))
		return -EAGAIN;
	async_msg = io_msg_alloc_async(req, issue_flags);
	if (!async_msg) {
		kfree(kmsg->free_iov);
		return -ENOMEM;
	}
	req->flags |= REQ_F_NEED_CLEANUP;
	memcpy(async_msg, kmsg, sizeof(*kmsg));
	if (async_msg->msg.msg_name)
		async_msg->msg.msg_name = &async_msg->addr;

	if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
		return -EAGAIN;

	/* if we were using fast_iov, set it to the new one */
	if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
		size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
		async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
	}

	return -EAGAIN;
}

static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
{
	int hdr;

	if (iomsg->namelen < 0)
		return true;
	if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
			       iomsg->namelen, &hdr))
		return true;
	if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr))
		return true;

	return false;
}

#ifdef CONFIG_COMPAT
static int __io_compat_msg_copy_hdr(struct io_kiocb *req,
				    struct io_async_msghdr *iomsg,
				    struct sockaddr __user **addr, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct compat_msghdr msg;
	struct compat_iovec __user *uiov;
	int ret;

	if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
		return -EFAULT;

	ret = __get_compat_msghdr(&iomsg->msg, &msg, addr);
	if (ret)
		return ret;

	uiov = compat_ptr(msg.msg_iov);
	if (req->flags & REQ_F_BUFFER_SELECT) {
		compat_ssize_t clen;

		iomsg->free_iov = NULL;
		if (msg.msg_iovlen == 0) {
			sr->len = 0;
		} else if (msg.msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (!access_ok(uiov, sizeof(*uiov)))
				return -EFAULT;
			if (__get_user(clen, &uiov->iov_len))
				return -EFAULT;
			if (clen < 0)
				return -EINVAL;
			sr->len = clen;
		}

		if (ddir == ITER_DEST && req->flags & REQ_F_APOLL_MULTISHOT) {
			iomsg->namelen = msg.msg_namelen;
			iomsg->controllen = msg.msg_controllen;
			if (io_recvmsg_multishot_overflow(iomsg))
				return -EOVERFLOW;
		}

		return 0;
	}

	iomsg->free_iov = iomsg->fast_iov;
	ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg.msg_iovlen,
				UIO_FASTIOV, &iomsg->free_iov,
				&iomsg->msg.msg_iter, true);
	if (unlikely(ret < 0))
		return ret;

	return 0;
}
#endif

static int __io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			     struct sockaddr __user **addr, int ddir)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct user_msghdr msg;
	int ret;

	if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
		return -EFAULT;

	ret = __copy_msghdr(&iomsg->msg, &msg, addr);
	if (ret)
		return ret;

	if (req->flags & REQ_F_BUFFER_SELECT) {
		if (msg.msg_iovlen == 0) {
			sr->len = iomsg->fast_iov[0].iov_len = 0;
			iomsg->fast_iov[0].iov_base = NULL;
			iomsg->free_iov = NULL;
		} else if (msg.msg_iovlen > 1) {
			return -EINVAL;
		} else {
			if (copy_from_user(iomsg->fast_iov, msg.msg_iov,
					   sizeof(*msg.msg_iov)))
				return -EFAULT;
			sr->len = iomsg->fast_iov[0].iov_len;
			iomsg->free_iov = NULL;
		}

		if (ddir == ITER_DEST && req->flags & REQ_F_APOLL_MULTISHOT) {
			iomsg->namelen = msg.msg_namelen;
			iomsg->controllen = msg.msg_controllen;
			if (io_recvmsg_multishot_overflow(iomsg))
				return -EOVERFLOW;
		}

		return 0;
	}

	iomsg->free_iov = iomsg->fast_iov;
	ret = __import_iovec(ddir, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
				&iomsg->free_iov, &iomsg->msg.msg_iter, false);
	if (unlikely(ret < 0))
		return ret;

	return 0;
}

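/*
 * Copy the user-supplied msghdr into kernel form, dispatching on the
 * ring's compat mode; ddir selects the iov validation direction.
 */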
static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg,
			   struct sockaddr __user **addr, int ddir)
{
	iomsg->msg.msg_name = &iomsg->addr;
	iomsg->msg.msg_iter.nr_segs = 0;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		return __io_compat_msg_copy_hdr(req, iomsg, addr, ddir);
#endif

	return __io_msg_copy_hdr(req, iomsg, addr, ddir);
}

static int io_sendmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	int ret;

	ret = io_msg_copy_hdr(req, iomsg, NULL, ITER_SOURCE);
	if (ret)
		return ret;

	/* save msg_control as sys_sendmsg() overwrites it */
	sr->msg_control = iomsg->msg.msg_control_user;
	return ret;
}

int io_send_prep_async(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;
	int ret;

	if (!zc->addr || req_has_async_data(req))
		return 0;
	io = io_msg_alloc_async_prep(req);
	if (!io)
		return -ENOMEM;
	ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr);
	return ret;
}

static int io_setup_async_addr(struct io_kiocb *req,
			      struct sockaddr_storage *addr_storage,
			      unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;

	if (!sr->addr || req_has_async_data(req))
		return -EAGAIN;
	io = io_msg_alloc_async(req, issue_flags);
	if (!io)
		return -ENOMEM;
	memcpy(&io->addr, addr_storage, sizeof(io->addr));
	return -EAGAIN;
}

int io_sendmsg_prep_async(struct io_kiocb *req)
{
	int ret;

	if (!io_msg_alloc_async_prep(req))
		return -ENOMEM;
	ret = io_sendmsg_copy_hdr(req, req->async_data);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
{
	struct io_async_msghdr *io = req->async_data;

	kfree(io->free_iov);
}

int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (req->opcode == IORING_OP_SEND) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		sr->addr_len = READ_ONCE(sqe->addr_len);
	} else if (sqe->addr2 || sqe->file_index) {
		return -EINVAL;
	}

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->done_io = 0;
	return 0;
}

int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
		kmsg->msg.msg_control_user = sr->msg_control;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);
		if (ret > 0 && io_net_retry(sock, flags)) {
			kmsg->msg.msg_controllen = 0;
			kmsg->msg.msg_control = NULL;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov)
		kfree(kmsg->free_iov);
	req->flags &= ~REQ_F_NEED_CLEANUP;
	io_netmsg_recycle(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_send(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address;
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned flags;
	int min_ret = 0;
	int ret;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;
	msg.msg_ubuf = NULL;

	if (sr->addr) {
		if (req_has_async_data(req)) {
			struct io_async_msghdr *io = req->async_data;

			msg.msg_name = &io->addr;
		} else {
			ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address);
			if (unlikely(ret < 0))
				return ret;
			msg.msg_name = (struct sockaddr *)&__address;
		}
		msg.msg_namelen = sr->addr_len;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_addr(req, &__address, issue_flags);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter);
	if (unlikely(ret))
		return ret;

	flags = sr->msg_flags;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
	msg.msg_flags = flags;
	ret = sock_sendmsg(sock, &msg);
	if (ret < min_ret) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_addr(req, &__address, issue_flags);

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, &__address, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

static int io_recvmsg_copy_hdr(struct io_kiocb *req,
			       struct io_async_msghdr *iomsg)
{
	return io_msg_copy_hdr(req, iomsg, &iomsg->uaddr, ITER_DEST);
}

int io_recvmsg_prep_async(struct io_kiocb *req)
{
	struct io_async_msghdr *iomsg;
	int ret;

	if (!io_msg_alloc_async_prep(req))
		return -ENOMEM;
	iomsg = req->async_data;
	ret = io_recvmsg_copy_hdr(req, iomsg);
	if (!ret)
		req->flags |= REQ_F_NEED_CLEANUP;
	return ret;
}

#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)

int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (unlikely(sqe->file_index || sqe->addr2))
		return -EINVAL;

	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
	sr->len = READ_ONCE(sqe->len);
	sr->flags = READ_ONCE(sqe->ioprio);
	if (sr->flags & ~(RECVMSG_FLAGS))
		return -EINVAL;
	sr->msg_flags = READ_ONCE(sqe->msg_flags);
	if (sr->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;
	if (sr->msg_flags & MSG_ERRQUEUE)
		req->flags |= REQ_F_CLEAR_POLLIN;
	if (sr->flags & IORING_RECV_MULTISHOT) {
		if (!(req->flags & REQ_F_BUFFER_SELECT))
			return -EINVAL;
		if (sr->msg_flags & MSG_WAITALL)
			return -EINVAL;
		if (req->opcode == IORING_OP_RECV && sr->len)
			return -EINVAL;
		req->flags |= REQ_F_APOLL_MULTISHOT;
		/*
		 * Store the buffer group for this multishot receive separately,
		 * as if we end up doing an io-wq based issue that selects a
		 * buffer, it has to be committed immediately and that will
		 * clear ->buf_list. This means we lose the link to the buffer
		 * list, and the eventual buffer put on completion then cannot
		 * restore it.
		 */
		sr->buf_group = req->buf_index;
	}

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		sr->msg_flags |= MSG_CMSG_COMPAT;
#endif
	sr->done_io = 0;
	sr->nr_multishot_loops = 0;
	return 0;
}

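/*
 * Userspace view of multishot receive, as an illustrative liburing
 * sketch (helper names come from liburing, not from this file; buffer
 * ring registration is elided):
 *
 *	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
 *	sqe->flags |= IOSQE_BUFFER_SELECT;
 *	sqe->buf_group = bgid;
 *
 * Each received chunk posts a CQE with IORING_CQE_F_MORE set; a CQE
 * without that flag means the multishot terminated and must be re-armed.
 */
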
static inline void io_recv_prep_retry(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	sr->done_io = 0;
	sr->len = 0; /* get from the provided buffer */
	req->buf_index = sr->buf_group;
}

/*
 * Finishes io_recv and io_recvmsg.
 *
 * Returns true if it is actually finished, or false if it should run
 * again (for multishot).
 */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
				  struct msghdr *msg, bool mshot_finished,
				  unsigned issue_flags)
{
	unsigned int cflags;

	cflags = io_put_kbuf(req, issue_flags);
	if (msg->msg_inq && msg->msg_inq != -1)
		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, *ret, cflags);
		*ret = IOU_OK;
		return true;
	}

	if (mshot_finished)
		goto finish;

	/*
	 * Fill CQE for this receive and see if we should keep trying to
	 * receive from this socket.
	 */
	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
				*ret, cflags | IORING_CQE_F_MORE)) {
		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;

		io_recv_prep_retry(req);
		/* Known not-empty or unknown state, retry */
		if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) {
			if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY)
				return false;
			/* mshot retries exceeded, force a requeue */
			sr->nr_multishot_loops = 0;
			mshot_retry_ret = IOU_REQUEUE;
		}
		if (issue_flags & IO_URING_F_MULTISHOT)
			*ret = mshot_retry_ret;
		else
			*ret = -EAGAIN;
		return true;
	}
	/* Otherwise stop multishot but use the current result. */
finish:
	io_req_set_res(req, *ret, cflags);

	if (issue_flags & IO_URING_F_MULTISHOT)
		*ret = IOU_STOP_MULTISHOT;
	else
		*ret = IOU_OK;
	return true;
}

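/*
 * A multishot recvmsg buffer is carved up as:
 *
 *	struct io_uring_recvmsg_out | name | control | payload
 *
 * Only the space is reserved here; the header itself is written back
 * by io_recvmsg_multishot() once the receive completes.
 */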
static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
				     struct io_sr_msg *sr, void __user **buf,
				     size_t *len)
{
	unsigned long ubuf = (unsigned long) *buf;
	unsigned long hdr;

	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
		kmsg->controllen;
	if (*len < hdr)
		return -EFAULT;

	if (kmsg->controllen) {
		unsigned long control = ubuf + hdr - kmsg->controllen;

		kmsg->msg.msg_control_user = (void __user *) control;
		kmsg->msg.msg_controllen = kmsg->controllen;
	}

	sr->buf = *buf; /* stash for later copy */
	*buf = (void __user *) (ubuf + hdr);
	kmsg->payloadlen = *len = *len - hdr;
	return 0;
}

struct io_recvmsg_multishot_hdr {
	struct io_uring_recvmsg_out msg;
	struct sockaddr_storage addr;
};

static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
				struct io_async_msghdr *kmsg,
				unsigned int flags, bool *finished)
{
	int err;
	int copy_len;
	struct io_recvmsg_multishot_hdr hdr;

	if (kmsg->namelen)
		kmsg->msg.msg_name = &hdr.addr;
	kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
	kmsg->msg.msg_namelen = 0;

	if (sock->file->f_flags & O_NONBLOCK)
		flags |= MSG_DONTWAIT;

	err = sock_recvmsg(sock, &kmsg->msg, flags);
	*finished = err <= 0;
	if (err < 0)
		return err;

	hdr.msg = (struct io_uring_recvmsg_out) {
		.controllen = kmsg->controllen - kmsg->msg.msg_controllen,
		.flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
	};

	hdr.msg.payloadlen = err;
	if (err > kmsg->payloadlen)
		err = kmsg->payloadlen;

	copy_len = sizeof(struct io_uring_recvmsg_out);
	if (kmsg->msg.msg_namelen > kmsg->namelen)
		copy_len += kmsg->namelen;
	else
		copy_len += kmsg->msg.msg_namelen;

	/*
	 *      "fromlen shall refer to the value before truncation.."
	 *                      1003.1g
	 */
	hdr.msg.namelen = kmsg->msg.msg_namelen;

	/* ensure that there is no gap between hdr and sockaddr_storage */
	BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
		     sizeof(struct io_uring_recvmsg_out));
	if (copy_to_user(io->buf, &hdr, copy_len)) {
		*finished = true;
		return -EFAULT;
	}

	return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
			kmsg->controllen + err;
}

int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	bool mshot_finished = true;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_recvmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	if (!io_check_multishot(req, issue_flags))
		return io_setup_async_msg(req, kmsg, issue_flags);

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;
		size_t len = sr->len;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;

		if (req->flags & REQ_F_APOLL_MULTISHOT) {
			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
			if (ret) {
				io_kbuf_recycle(req, issue_flags);
				return ret;
			}
		}

		iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len);
	}

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;

	kmsg->msg.msg_get_inq = 1;
	kmsg->msg.msg_inq = -1;
	if (req->flags & REQ_F_APOLL_MULTISHOT) {
		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
					   &mshot_finished);
	} else {
		/* disable partial retry for recvmsg with cmsg attached */
		if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
			min_ret = iov_iter_count(&kmsg->msg.msg_iter);

		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
					 kmsg->uaddr, flags);
	}

	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			ret = io_setup_async_msg(req, kmsg, issue_flags);
			if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}
			return ret;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
		goto retry_multishot;

	if (mshot_finished) {
		/* fast path, check for non-NULL to avoid function call */
		if (kmsg->free_iov)
			kfree(kmsg->free_iov);
		io_netmsg_recycle(req, issue_flags);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}

	return ret;
}

int io_recv(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	size_t len = sr->len;

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return -EAGAIN;

	if (!io_check_multishot(req, issue_flags))
		return -EAGAIN;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_control = NULL;
	msg.msg_get_inq = 1;
	msg.msg_controllen = 0;
	msg.msg_iocb = NULL;
	msg.msg_ubuf = NULL;

retry_multishot:
	if (io_do_buffer_select(req)) {
		void __user *buf;

		buf = io_buffer_select(req, &len, issue_flags);
		if (!buf)
			return -ENOBUFS;
		sr->buf = buf;
		sr->len = len;
	}

	ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);
	if (unlikely(ret))
		goto out_free;

	msg.msg_inq = -1;
	msg.msg_flags = 0;

	flags = sr->msg_flags;
	if (force_nonblock)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);

	ret = sock_recvmsg(sock, &msg, flags);
	if (ret < min_ret) {
		if (ret == -EAGAIN && force_nonblock) {
			if (issue_flags & IO_URING_F_MULTISHOT) {
				io_kbuf_recycle(req, issue_flags);
				return IOU_ISSUE_SKIP_COMPLETE;
			}

			return -EAGAIN;
		}
		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->len -= ret;
			sr->buf += ret;
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return -EAGAIN;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
out_free:
		req_set_fail(req);
	}

	if (ret > 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;
	else
		io_kbuf_recycle(req, issue_flags);

	if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
		goto retry_multishot;

	return ret;
}

void io_send_zc_cleanup(struct io_kiocb *req)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr *io;

	if (req_has_async_data(req)) {
		io = req->async_data;
		/* might be ->fast_iov if *msg_copy_hdr failed */
		if (io->free_iov != io->fast_iov)
			kfree(io->free_iov);
	}
	if (zc->notif) {
		io_notif_flush(zc->notif);
		zc->notif = NULL;
	}
}

#define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)
#define IO_ZC_FLAGS_VALID  (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE)

int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_ring_ctx *ctx = req->ctx;
	struct io_kiocb *notif;

	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
		return -EINVAL;
	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
	if (req->flags & REQ_F_CQE_SKIP)
		return -EINVAL;

	notif = zc->notif = io_alloc_notif(ctx);
	if (!notif)
		return -ENOMEM;
	notif->cqe.user_data = req->cqe.user_data;
	notif->cqe.res = 0;
	notif->cqe.flags = IORING_CQE_F_NOTIF;
	req->flags |= REQ_F_NEED_CLEANUP;

	zc->flags = READ_ONCE(sqe->ioprio);
	if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) {
		if (zc->flags & ~IO_ZC_FLAGS_VALID)
			return -EINVAL;
		if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) {
			io_notif_set_extended(notif);
			io_notif_to_data(notif)->zc_report = true;
		}
	}

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		unsigned idx = READ_ONCE(sqe->buf_index);

		if (unlikely(idx >= ctx->nr_user_bufs))
			return -EFAULT;
		idx = array_index_nospec(idx, ctx->nr_user_bufs);
		req->imu = READ_ONCE(ctx->user_bufs[idx]);
		io_req_set_rsrc_node(notif, ctx, 0);
	}

	if (req->opcode == IORING_OP_SEND_ZC) {
		if (READ_ONCE(sqe->__pad3[0]))
			return -EINVAL;
		zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
		zc->addr_len = READ_ONCE(sqe->addr_len);
	} else {
		if (unlikely(sqe->addr2 || sqe->file_index))
			return -EINVAL;
		if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF))
			return -EINVAL;
	}

	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
	zc->len = READ_ONCE(sqe->len);
	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
	if (zc->msg_flags & MSG_DONTWAIT)
		req->flags |= REQ_F_NOWAIT;

	zc->done_io = 0;

#ifdef CONFIG_COMPAT
	if (req->ctx->compat)
		zc->msg_flags |= MSG_CMSG_COMPAT;
#endif
	return 0;
}
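
/*
 * Zero-copy send posts two CQEs for one SQE: the send result first,
 * flagged IORING_CQE_F_MORE, then a notification CQE flagged
 * IORING_CQE_F_NOTIF once the kernel has released the user pages.
 * Illustrative liburing sketch (the helper is liburing's, not defined
 * here); the buffer must not be reused until the NOTIF CQE arrives:
 *
 *	io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
 */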

static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb,
				 struct iov_iter *from, size_t length)
{
	skb_zcopy_downgrade_managed(skb);
	return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
}

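/*
 * Splice pages from a bvec-backed iterator (a registered buffer)
 * straight into skb frags without taking per-page references; the
 * pages stay pinned via the registered-buffer node for the request's
 * lifetime, hence SKBFL_MANAGED_FRAG_REFS.
 */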
static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			   struct iov_iter *from, size_t length)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int frag = shinfo->nr_frags;
	int ret = 0;
	struct bvec_iter bi;
	ssize_t copied = 0;
	unsigned long truesize = 0;

	if (!frag)
		shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
	else if (unlikely(!skb_zcopy_managed(skb)))
		return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);

	bi.bi_size = min(from->count, length);
	bi.bi_bvec_done = from->iov_offset;
	bi.bi_idx = 0;

	while (bi.bi_size && frag < MAX_SKB_FRAGS) {
		struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);

		copied += v.bv_len;
		truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
		__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
					   v.bv_offset, v.bv_len);
		bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
	}
	if (bi.bi_size)
		ret = -EMSGSIZE;

	shinfo->nr_frags = frag;
	from->bvec += bi.bi_idx;
	from->nr_segs -= bi.bi_idx;
	from->count -= copied;
	from->iov_offset = bi.bi_bvec_done;

	skb->data_len += copied;
	skb->len += copied;
	skb->truesize += truesize;

	if (sk && sk->sk_type == SOCK_STREAM) {
		sk_wmem_queued_add(sk, truesize);
		if (!skb_zcopy_pure(skb))
			sk_mem_charge(sk, truesize);
	} else {
		refcount_add(truesize, &skb->sk->sk_wmem_alloc);
	}
	return ret;
}

int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct sockaddr_storage __address;
	struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct msghdr msg;
	struct socket *sock;
	unsigned msg_flags;
	int ret, min_ret = 0;

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	msg.msg_name = NULL;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_namelen = 0;

	if (zc->addr) {
		if (req_has_async_data(req)) {
			struct io_async_msghdr *io = req->async_data;

			msg.msg_name = &io->addr;
		} else {
			ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address);
			if (unlikely(ret < 0))
				return ret;
			msg.msg_name = (struct sockaddr *)&__address;
		}
		msg.msg_namelen = zc->addr_len;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_addr(req, &__address, issue_flags);

	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
		ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
					(u64)(uintptr_t)zc->buf, zc->len);
		if (unlikely(ret))
			return ret;
		msg.sg_from_iter = io_sg_from_iter;
	} else {
		io_notif_set_extended(zc->notif);
		ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter);
		if (unlikely(ret))
			return ret;
		ret = io_notif_account_mem(zc->notif, zc->len);
		if (unlikely(ret))
			return ret;
		msg.sg_from_iter = io_sg_from_iter_iovec;
	}

	msg_flags = zc->msg_flags | MSG_ZEROCOPY;
	if (issue_flags & IO_URING_F_NONBLOCK)
		msg_flags |= MSG_DONTWAIT;
	if (msg_flags & MSG_WAITALL)
		min_ret = iov_iter_count(&msg.msg_iter);
	msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;

	msg.msg_flags = msg_flags;
	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
	ret = sock_sendmsg(sock, &msg);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_addr(req, &__address, issue_flags);

		if (ret > 0 && io_net_retry(sock, msg.msg_flags)) {
			zc->len -= ret;
			zc->buf += ret;
			zc->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_addr(req, &__address, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}

	if (ret >= 0)
		ret += zc->done_io;
	else if (zc->done_io)
		ret = zc->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, so
	 * defer flushing the notif to io_send_zc_cleanup().
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(zc->notif);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
	struct io_async_msghdr iomsg, *kmsg;
	struct socket *sock;
	unsigned flags;
	int ret, min_ret = 0;

	io_notif_set_extended(sr->notif);

	sock = sock_from_file(req->file);
	if (unlikely(!sock))
		return -ENOTSOCK;
	if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags))
		return -EOPNOTSUPP;

	if (req_has_async_data(req)) {
		kmsg = req->async_data;
	} else {
		ret = io_sendmsg_copy_hdr(req, &iomsg);
		if (ret)
			return ret;
		kmsg = &iomsg;
	}

	if (!(req->flags & REQ_F_POLLED) &&
	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
		return io_setup_async_msg(req, kmsg, issue_flags);

	flags = sr->msg_flags | MSG_ZEROCOPY;
	if (issue_flags & IO_URING_F_NONBLOCK)
		flags |= MSG_DONTWAIT;
	if (flags & MSG_WAITALL)
		min_ret = iov_iter_count(&kmsg->msg.msg_iter);

	kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg;
	kmsg->msg.sg_from_iter = io_sg_from_iter_iovec;
	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);

	if (unlikely(ret < min_ret)) {
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return io_setup_async_msg(req, kmsg, issue_flags);

		if (ret > 0 && io_net_retry(sock, flags)) {
			sr->done_io += ret;
			req->flags |= REQ_F_PARTIAL_IO;
			return io_setup_async_msg(req, kmsg, issue_flags);
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	}
	/* fast path, check for non-NULL to avoid function call */
	if (kmsg->free_iov) {
		kfree(kmsg->free_iov);
		kmsg->free_iov = NULL;
	}

	io_netmsg_recycle(req, issue_flags);
	if (ret >= 0)
		ret += sr->done_io;
	else if (sr->done_io)
		ret = sr->done_io;

	/*
	 * If we're in io-wq we can't rely on tw ordering guarantees, so
	 * defer flushing the notif to io_send_zc_cleanup().
	 */
	if (!(issue_flags & IO_URING_F_UNLOCKED)) {
		io_notif_flush(sr->notif);
		req->flags &= ~REQ_F_NEED_CLEANUP;
	}
	io_req_set_res(req, ret, IORING_CQE_F_MORE);
	return IOU_OK;
}

void io_sendrecv_fail(struct io_kiocb *req)
{
	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);

	if (req->flags & REQ_F_PARTIAL_IO)
		req->cqe.res = sr->done_io;

	if ((req->flags & REQ_F_NEED_CLEANUP) &&
	    (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC))
		req->cqe.flags |= IORING_CQE_F_MORE;
}

int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	unsigned flags;

	if (sqe->len || sqe->buf_index)
		return -EINVAL;

	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
	accept->flags = READ_ONCE(sqe->accept_flags);
	accept->nofile = rlimit(RLIMIT_NOFILE);
	flags = READ_ONCE(sqe->ioprio);
	if (flags & ~IORING_ACCEPT_MULTISHOT)
		return -EINVAL;

	accept->file_slot = READ_ONCE(sqe->file_index);
	if (accept->file_slot) {
		if (accept->flags & SOCK_CLOEXEC)
			return -EINVAL;
		if (flags & IORING_ACCEPT_MULTISHOT &&
		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
			return -EINVAL;
	}
	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
	if (flags & IORING_ACCEPT_MULTISHOT)
		req->flags |= REQ_F_APOLL_MULTISHOT;
	return 0;
}

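/*
 * Illustrative multishot accept from userspace (liburing sketch; the
 * helper is liburing's, not defined here):
 *
 *	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
 *
 * Each accepted connection posts a CQE carrying the new descriptor in
 * cqe->res, with IORING_CQE_F_MORE set while the request stays armed.
 */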
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
	bool fixed = !!accept->file_slot;
	struct file *file;
	int ret, fd;

	if (!io_check_multishot(req, issue_flags))
		return -EAGAIN;
retry:
	if (!fixed) {
		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
			 accept->flags);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && force_nonblock) {
			/*
			 * if it's multishot and polled, we don't need to
			 * return -EAGAIN to arm the poll infra since it
			 * has already been done
			 */
			if (issue_flags & IO_URING_F_MULTISHOT)
				return IOU_ISSUE_SKIP_COMPLETE;
			return ret;
		}
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
						accept->file_slot);
	}

	if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
		io_req_set_res(req, ret, 0);
		return IOU_OK;
	}

	if (ret < 0)
		return ret;
	if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
				ret, IORING_CQE_F_MORE))
		goto retry;

	io_req_set_res(req, ret, 0);
	return IOU_STOP_MULTISHOT;
}

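/*
 * IORING_OP_SOCKET reuses otherwise-unused SQE fields: the domain
 * rides in ->fd, the type in ->off and the protocol in ->len.
 */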
int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);

	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
		return -EINVAL;

	sock->domain = READ_ONCE(sqe->fd);
	sock->type = READ_ONCE(sqe->off);
	sock->protocol = READ_ONCE(sqe->len);
	sock->file_slot = READ_ONCE(sqe->file_index);
	sock->nofile = rlimit(RLIMIT_NOFILE);

	sock->flags = sock->type & ~SOCK_TYPE_MASK;
	if (sock->file_slot && (sock->flags & SOCK_CLOEXEC))
		return -EINVAL;
	if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	return 0;
}

int io_socket(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_socket *sock = io_kiocb_to_cmd(req, struct io_socket);
	bool fixed = !!sock->file_slot;
	struct file *file;
	int ret, fd;

	if (!fixed) {
		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
		if (unlikely(fd < 0))
			return fd;
	}
	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
	if (IS_ERR(file)) {
		if (!fixed)
			put_unused_fd(fd);
		ret = PTR_ERR(file);
		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
			return -EAGAIN;
		if (ret == -ERESTARTSYS)
			ret = -EINTR;
		req_set_fail(req);
	} else if (!fixed) {
		fd_install(fd, file);
		ret = fd;
	} else {
		ret = io_fixed_fd_install(req, issue_flags, file,
					    sock->file_slot);
	}
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

int io_connect_prep_async(struct io_kiocb *req)
{
	struct io_async_connect *io = req->async_data;
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);

	return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address);
}

int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
	struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect);

	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
		return -EINVAL;

	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
	conn->addr_len = READ_ONCE(sqe->addr2);
	conn->in_progress = conn->seen_econnaborted = false;
	return 0;
}

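/*
 * Nonblocking connect: -EAGAIN/-EINPROGRESS/-ECONNABORTED park the
 * request with its address stashed in async data until the socket
 * signals readiness; on reissue after -EINPROGRESS the final status is
 * fetched via sock_error() below.
 */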
int io_connect(struct io_kiocb *req, unsigned int issue_flags)
{
	struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect);
	struct io_async_connect __io, *io;
	unsigned file_flags;
	int ret;
	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;

	if (req_has_async_data(req)) {
		io = req->async_data;
	} else {
		ret = move_addr_to_kernel(connect->addr,
						connect->addr_len,
						&__io.address);
		if (ret)
			goto out;
		io = &__io;
	}

	file_flags = force_nonblock ? O_NONBLOCK : 0;

	ret = __sys_connect_file(req->file, &io->address,
					connect->addr_len, file_flags);
	if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED)
	    && force_nonblock) {
		if (ret == -EINPROGRESS) {
			connect->in_progress = true;
		} else if (ret == -ECONNABORTED) {
			if (connect->seen_econnaborted)
				goto out;
			connect->seen_econnaborted = true;
		}
		if (req_has_async_data(req))
			return -EAGAIN;
		if (io_alloc_async_data(req)) {
			ret = -ENOMEM;
			goto out;
		}
		memcpy(req->async_data, &__io, sizeof(__io));
		return -EAGAIN;
	}
	if (connect->in_progress) {
		/*
		 * At least bluetooth will return -EBADFD on a re-connect
		 * attempt, and it's (supposedly) also valid to get -EISCONN
		 * which means the previous result is good. For both of these,
		 * grab the sock_error() and use that for the completion.
		 */
		if (ret == -EBADFD || ret == -EISCONN)
			ret = sock_error(sock_from_file(req->file)->sk);
	}
	if (ret == -ERESTARTSYS)
		ret = -EINTR;
out:
	if (ret < 0)
		req_set_fail(req);
	io_req_set_res(req, ret, 0);
	return IOU_OK;
}

void io_netmsg_cache_free(struct io_cache_entry *entry)
{
	kfree(container_of(entry, struct io_async_msghdr, cache));
}
#endif