xref: /openbmc/linux/io_uring/rsrc.c (revision 5ff4fdff)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 
13 #include <uapi/linux/io_uring.h>
14 
15 #include "io_uring_types.h"
16 #include "io_uring.h"
17 #include "openclose.h"
18 #include "rsrc.h"
19 
20 struct io_rsrc_update {
21 	struct file			*file;
22 	u64				arg;
23 	u32				nr_args;
24 	u32				offset;
25 };
26 
27 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
28 				  struct io_mapped_ubuf **pimu,
29 				  struct page **last_hpage);
30 
31 #define IO_RSRC_REF_BATCH	100
32 
33 /* only define max */
34 #define IORING_MAX_FIXED_FILES	(1U << 20)
35 #define IORING_MAX_REG_BUFFERS	(1U << 14)
36 
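/*
 * ctx->rsrc_cached_refs is a per-ring cache of references against the current
 * rsrc node: io_rsrc_refs_refill() below takes IO_RSRC_REF_BATCH percpu refs
 * up front, and this helper returns whatever is still cached, e.g. before the
 * node is switched out in io_rsrc_node_switch().
 */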
37 void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
38 	__must_hold(&ctx->uring_lock)
39 {
40 	if (ctx->rsrc_cached_refs) {
41 		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
42 		ctx->rsrc_cached_refs = 0;
43 	}
44 }
45 
46 static inline void __io_unaccount_mem(struct user_struct *user,
47 				      unsigned long nr_pages)
48 {
49 	atomic_long_sub(nr_pages, &user->locked_vm);
50 }
51 
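/*
 * Charge nr_pages against the user's RLIMIT_MEMLOCK via a cmpxchg loop on
 * user->locked_vm, failing with -ENOMEM once the limit would be exceeded.
 * __io_unaccount_mem() above is the matching uncharge.
 */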
52 static inline int __io_account_mem(struct user_struct *user,
53 				   unsigned long nr_pages)
54 {
55 	unsigned long page_limit, cur_pages, new_pages;
56 
57 	/* Don't allow more pages than we can safely lock */
58 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
59 
60 	do {
61 		cur_pages = atomic_long_read(&user->locked_vm);
62 		new_pages = cur_pages + nr_pages;
63 		if (new_pages > page_limit)
64 			return -ENOMEM;
65 	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
66 					new_pages) != cur_pages);
67 
68 	return 0;
69 }
70 
71 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
72 {
73 	if (ctx->user)
74 		__io_unaccount_mem(ctx->user, nr_pages);
75 
76 	if (ctx->mm_account)
77 		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
78 }
79 
80 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
81 {
82 	int ret;
83 
84 	if (ctx->user) {
85 		ret = __io_account_mem(ctx->user, nr_pages);
86 		if (ret)
87 			return ret;
88 	}
89 
90 	if (ctx->mm_account)
91 		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
92 
93 	return 0;
94 }
95 
96 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
97 		       void __user *arg, unsigned index)
98 {
99 	struct iovec __user *src;
100 
101 #ifdef CONFIG_COMPAT
102 	if (ctx->compat) {
103 		struct compat_iovec __user *ciovs;
104 		struct compat_iovec ciov;
105 
106 		ciovs = (struct compat_iovec __user *) arg;
107 		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
108 			return -EFAULT;
109 
110 		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
111 		dst->iov_len = ciov.iov_len;
112 		return 0;
113 	}
114 #endif
115 	src = (struct iovec __user *) arg;
116 	if (copy_from_user(dst, &src[index], sizeof(*dst)))
117 		return -EFAULT;
118 	return 0;
119 }
120 
121 static int io_buffer_validate(struct iovec *iov)
122 {
123 	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
124 
125 	/*
126 	 * Don't impose further limits on the size and buffer
127 	 * constraints here; we'll -EINVAL later, when the IO is
128 	 * submitted, if they are wrong.
129 	 */
130 	if (!iov->iov_base)
131 		return iov->iov_len ? -EFAULT : 0;
132 	if (!iov->iov_len)
133 		return -EFAULT;
134 
135 	/* arbitrary limit, but we need something */
136 	if (iov->iov_len > SZ_1G)
137 		return -EFAULT;
138 
139 	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
140 		return -EOVERFLOW;
141 
142 	return 0;
143 }
144 
145 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
146 {
147 	struct io_mapped_ubuf *imu = *slot;
148 	unsigned int i;
149 
150 	if (imu != ctx->dummy_ubuf) {
151 		for (i = 0; i < imu->nr_bvecs; i++)
152 			unpin_user_page(imu->bvec[i].bv_page);
153 		if (imu->acct_pages)
154 			io_unaccount_mem(ctx, imu->acct_pages);
155 		kvfree(imu);
156 	}
157 	*slot = NULL;
158 }
159 
160 void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
161 	__must_hold(&ctx->uring_lock)
162 {
163 	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
164 	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
165 }
166 
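/*
 * Drain one retired rsrc node: every queued io_rsrc_put is handed to the
 * owning data's ->do_put() callback, and resources that were registered with
 * a non-zero tag additionally post a CQE carrying that tag, so userspace can
 * tell when the old resource is truly gone.
 */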
167 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
168 {
169 	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
170 	struct io_ring_ctx *ctx = rsrc_data->ctx;
171 	struct io_rsrc_put *prsrc, *tmp;
172 
173 	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
174 		list_del(&prsrc->list);
175 
176 		if (prsrc->tag) {
177 			if (ctx->flags & IORING_SETUP_IOPOLL)
178 				mutex_lock(&ctx->uring_lock);
179 
180 			spin_lock(&ctx->completion_lock);
181 			io_fill_cqe_aux(ctx, prsrc->tag, 0, 0);
182 			io_commit_cqring(ctx);
183 			spin_unlock(&ctx->completion_lock);
184 			io_cqring_ev_posted(ctx);
185 
186 			if (ctx->flags & IORING_SETUP_IOPOLL)
187 				mutex_unlock(&ctx->uring_lock);
188 		}
189 
190 		rsrc_data->do_put(ctx, prsrc);
191 		kfree(prsrc);
192 	}
193 
194 	io_rsrc_node_destroy(ref_node);
195 	if (atomic_dec_and_test(&rsrc_data->refs))
196 		complete(&rsrc_data->done);
197 }
198 
199 void io_rsrc_put_work(struct work_struct *work)
200 {
201 	struct io_ring_ctx *ctx;
202 	struct llist_node *node;
203 
204 	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
205 	node = llist_del_all(&ctx->rsrc_put_llist);
206 
207 	while (node) {
208 		struct io_rsrc_node *ref_node;
209 		struct llist_node *next = node->next;
210 
211 		ref_node = llist_entry(node, struct io_rsrc_node, llist);
212 		__io_rsrc_put_work(ref_node);
213 		node = next;
214 	}
215 }
216 
217 void io_wait_rsrc_data(struct io_rsrc_data *data)
218 {
219 	if (data && !atomic_dec_and_test(&data->refs))
220 		wait_for_completion(&data->done);
221 }
222 
223 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
224 {
225 	percpu_ref_exit(&ref_node->refs);
226 	kfree(ref_node);
227 }
228 
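/*
 * Called when the percpu ref of a retired node drops to zero.  Nodes are
 * recycled strictly in the order they were added to ctx->rsrc_ref_list, so a
 * node that finished early is only queued for io_rsrc_put_work() once all
 * older nodes are done as well; the put work is normally delayed by HZ but
 * runs immediately while a quiesce is in progress.
 */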
229 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
230 {
231 	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
232 	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
233 	unsigned long flags;
234 	bool first_add = false;
235 	unsigned long delay = HZ;
236 
237 	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
238 	node->done = true;
239 
240 	/* if we are mid-quiesce then do not delay */
241 	if (node->rsrc_data->quiesce)
242 		delay = 0;
243 
244 	while (!list_empty(&ctx->rsrc_ref_list)) {
245 		node = list_first_entry(&ctx->rsrc_ref_list,
246 					    struct io_rsrc_node, node);
247 		/* recycle ref nodes in order */
248 		if (!node->done)
249 			break;
250 		list_del(&node->node);
251 		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
252 	}
253 	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
254 
255 	if (first_add)
256 		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
257 }
258 
259 static struct io_rsrc_node *io_rsrc_node_alloc(void)
260 {
261 	struct io_rsrc_node *ref_node;
262 
263 	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
264 	if (!ref_node)
265 		return NULL;
266 
267 	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
268 			    0, GFP_KERNEL)) {
269 		kfree(ref_node);
270 		return NULL;
271 	}
272 	INIT_LIST_HEAD(&ref_node->node);
273 	INIT_LIST_HEAD(&ref_node->rsrc_list);
274 	ref_node->done = false;
275 	return ref_node;
276 }
277 
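/*
 * Retire the current rsrc node (attributing it to @data_to_kill) and install
 * the backup node parked by io_rsrc_node_switch_start().  This never fails,
 * which is why callers in this file follow a pattern roughly like:
 *
 *	ret = io_rsrc_node_switch_start(ctx);
 *	if (ret)
 *		return ret;
 *	...update the file or buffer table...
 *	io_rsrc_node_switch(ctx, data);
 */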
278 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
279 			 struct io_rsrc_data *data_to_kill)
280 	__must_hold(&ctx->uring_lock)
281 {
282 	WARN_ON_ONCE(!ctx->rsrc_backup_node);
283 	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
284 
285 	io_rsrc_refs_drop(ctx);
286 
287 	if (data_to_kill) {
288 		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
289 
290 		rsrc_node->rsrc_data = data_to_kill;
291 		spin_lock_irq(&ctx->rsrc_ref_lock);
292 		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
293 		spin_unlock_irq(&ctx->rsrc_ref_lock);
294 
295 		atomic_inc(&data_to_kill->refs);
296 		percpu_ref_kill(&rsrc_node->refs);
297 		ctx->rsrc_node = NULL;
298 	}
299 
300 	if (!ctx->rsrc_node) {
301 		ctx->rsrc_node = ctx->rsrc_backup_node;
302 		ctx->rsrc_backup_node = NULL;
303 	}
304 }
305 
306 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
307 {
308 	if (ctx->rsrc_backup_node)
309 		return 0;
310 	ctx->rsrc_backup_node = io_rsrc_node_alloc();
311 	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
312 }
313 
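/*
 * Wait until all outstanding rsrc node references to @data have been dropped.
 * The initial data ref is killed and the current node retired, then we drop
 * ->uring_lock and sleep on data->done, retrying (and re-taking a ref) if the
 * wait was interrupted or the data was revived by another thread meanwhile.
 */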
314 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
315 				      struct io_ring_ctx *ctx)
316 {
317 	int ret;
318 
319 	/* As we may drop ->uring_lock, another task may have started a quiesce */
320 	if (data->quiesce)
321 		return -ENXIO;
322 
323 	data->quiesce = true;
324 	do {
325 		ret = io_rsrc_node_switch_start(ctx);
326 		if (ret)
327 			break;
328 		io_rsrc_node_switch(ctx, data);
329 
330 		/* kill initial ref, already quiesced if zero */
331 		if (atomic_dec_and_test(&data->refs))
332 			break;
333 		mutex_unlock(&ctx->uring_lock);
334 		flush_delayed_work(&ctx->rsrc_put_work);
335 		ret = wait_for_completion_interruptible(&data->done);
336 		if (!ret) {
337 			mutex_lock(&ctx->uring_lock);
338 			if (atomic_read(&data->refs) > 0) {
339 				/*
340 				 * it has been revived by another thread while
341 				 * we were unlocked
342 				 */
343 				mutex_unlock(&ctx->uring_lock);
344 			} else {
345 				break;
346 			}
347 		}
348 
349 		atomic_inc(&data->refs);
350 	/* wait for any work potentially completing data->done */
351 		flush_delayed_work(&ctx->rsrc_put_work);
352 		reinit_completion(&data->done);
353 
354 		ret = io_run_task_work_sig();
355 		mutex_lock(&ctx->uring_lock);
356 	} while (ret >= 0);
357 	data->quiesce = false;
358 
359 	return ret;
360 }
361 
362 static void io_free_page_table(void **table, size_t size)
363 {
364 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
365 
366 	for (i = 0; i < nr_tables; i++)
367 		kfree(table[i]);
368 	kfree(table);
369 }
370 
371 static void io_rsrc_data_free(struct io_rsrc_data *data)
372 {
373 	size_t size = data->nr * sizeof(data->tags[0][0]);
374 
375 	if (data->tags)
376 		io_free_page_table((void **)data->tags, size);
377 	kfree(data);
378 }
379 
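/*
 * The tag array is stored as a two-level table of PAGE_SIZE chunks, so a
 * large registration (up to 2^20 files) doesn't need one huge allocation;
 * io_free_page_table() above is the inverse.
 */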
380 static __cold void **io_alloc_page_table(size_t size)
381 {
382 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
383 	size_t init_size = size;
384 	void **table;
385 
386 	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
387 	if (!table)
388 		return NULL;
389 
390 	for (i = 0; i < nr_tables; i++) {
391 		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
392 
393 		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
394 		if (!table[i]) {
395 			io_free_page_table(table, init_size);
396 			return NULL;
397 		}
398 		size -= this_size;
399 	}
400 	return table;
401 }
402 
403 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
404 				     rsrc_put_fn *do_put, u64 __user *utags,
405 				     unsigned nr, struct io_rsrc_data **pdata)
406 {
407 	struct io_rsrc_data *data;
408 	int ret = -ENOMEM;
409 	unsigned i;
410 
411 	data = kzalloc(sizeof(*data), GFP_KERNEL);
412 	if (!data)
413 		return -ENOMEM;
414 	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
415 	if (!data->tags) {
416 		kfree(data);
417 		return -ENOMEM;
418 	}
419 
420 	data->nr = nr;
421 	data->ctx = ctx;
422 	data->do_put = do_put;
423 	if (utags) {
424 		ret = -EFAULT;
425 		for (i = 0; i < nr; i++) {
426 			u64 *tag_slot = io_get_tag_slot(data, i);
427 
428 			if (copy_from_user(tag_slot, &utags[i],
429 					   sizeof(*tag_slot)))
430 				goto fail;
431 		}
432 	}
433 
434 	atomic_set(&data->refs, 1);
435 	init_completion(&data->done);
436 	*pdata = data;
437 	return 0;
438 fail:
439 	io_rsrc_data_free(data);
440 	return ret;
441 }
442 
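/*
 * Apply a registered-file update: IORING_REGISTER_FILES_SKIP entries are left
 * untouched, otherwise any existing file in the slot is queued for removal on
 * the current rsrc node and the new fd (unless -1) is installed in its place.
 * Returns the number of entries processed, or an error if none were.
 */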
443 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
444 				 struct io_uring_rsrc_update2 *up,
445 				 unsigned nr_args)
446 {
447 	u64 __user *tags = u64_to_user_ptr(up->tags);
448 	__s32 __user *fds = u64_to_user_ptr(up->data);
449 	struct io_rsrc_data *data = ctx->file_data;
450 	struct io_fixed_file *file_slot;
451 	struct file *file;
452 	int fd, i, err = 0;
453 	unsigned int done;
454 	bool needs_switch = false;
455 
456 	if (!ctx->file_data)
457 		return -ENXIO;
458 	if (up->offset + nr_args > ctx->nr_user_files)
459 		return -EINVAL;
460 
461 	for (done = 0; done < nr_args; done++) {
462 		u64 tag = 0;
463 
464 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
465 		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
466 			err = -EFAULT;
467 			break;
468 		}
469 		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
470 			err = -EINVAL;
471 			break;
472 		}
473 		if (fd == IORING_REGISTER_FILES_SKIP)
474 			continue;
475 
476 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
477 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
478 
479 		if (file_slot->file_ptr) {
480 			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
481 			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
482 			if (err)
483 				break;
484 			file_slot->file_ptr = 0;
485 			io_file_bitmap_clear(&ctx->file_table, i);
486 			needs_switch = true;
487 		}
488 		if (fd != -1) {
489 			file = fget(fd);
490 			if (!file) {
491 				err = -EBADF;
492 				break;
493 			}
494 			/*
495 			 * Don't allow io_uring instances to be registered. If
496 			 * UNIX isn't enabled, then this causes a reference
497 			 * cycle and this instance can never get freed. If UNIX
498 			 * is enabled we'll handle it just fine, but there's
499 			 * still no point in allowing a ring fd as it doesn't
500 			 * support regular read/write anyway.
501 			 */
502 			if (io_is_uring_fops(file)) {
503 				fput(file);
504 				err = -EBADF;
505 				break;
506 			}
507 			err = io_scm_file_account(ctx, file);
508 			if (err) {
509 				fput(file);
510 				break;
511 			}
512 			*io_get_tag_slot(data, i) = tag;
513 			io_fixed_file_set(file_slot, file);
514 			io_file_bitmap_set(&ctx->file_table, i);
515 		}
516 	}
517 
518 	if (needs_switch)
519 		io_rsrc_node_switch(ctx, data);
520 	return done ? done : err;
521 }
522 
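/*
 * Buffer-table counterpart of __io_sqe_files_update(): each updated slot gets
 * a freshly pinned buffer while the previous io_mapped_ubuf is queued for
 * removal, and the rsrc node is switched if any slot was replaced.
 */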
523 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
524 				   struct io_uring_rsrc_update2 *up,
525 				   unsigned int nr_args)
526 {
527 	u64 __user *tags = u64_to_user_ptr(up->tags);
528 	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
529 	struct page *last_hpage = NULL;
530 	bool needs_switch = false;
531 	__u32 done;
532 	int i, err;
533 
534 	if (!ctx->buf_data)
535 		return -ENXIO;
536 	if (up->offset + nr_args > ctx->nr_user_bufs)
537 		return -EINVAL;
538 
539 	for (done = 0; done < nr_args; done++) {
540 		struct io_mapped_ubuf *imu;
541 		int offset = up->offset + done;
542 		u64 tag = 0;
543 
544 		err = io_copy_iov(ctx, &iov, iovs, done);
545 		if (err)
546 			break;
547 		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
548 			err = -EFAULT;
549 			break;
550 		}
551 		err = io_buffer_validate(&iov);
552 		if (err)
553 			break;
554 		if (!iov.iov_base && tag) {
555 			err = -EINVAL;
556 			break;
557 		}
558 		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
559 		if (err)
560 			break;
561 
562 		i = array_index_nospec(offset, ctx->nr_user_bufs);
563 		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
564 			err = io_queue_rsrc_removal(ctx->buf_data, i,
565 						    ctx->rsrc_node, ctx->user_bufs[i]);
566 			if (unlikely(err)) {
567 				io_buffer_unmap(ctx, &imu);
568 				break;
569 			}
570 			ctx->user_bufs[i] = ctx->dummy_ubuf;
571 			needs_switch = true;
572 		}
573 
574 		ctx->user_bufs[i] = imu;
575 		*io_get_tag_slot(ctx->buf_data, offset) = tag;
576 	}
577 
578 	if (needs_switch)
579 		io_rsrc_node_switch(ctx, ctx->buf_data);
580 	return done ? done : err;
581 }
582 
583 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
584 				     struct io_uring_rsrc_update2 *up,
585 				     unsigned nr_args)
586 {
587 	__u32 tmp;
588 	int err;
589 
590 	if (check_add_overflow(up->offset, nr_args, &tmp))
591 		return -EOVERFLOW;
592 	err = io_rsrc_node_switch_start(ctx);
593 	if (err)
594 		return err;
595 
596 	switch (type) {
597 	case IORING_RSRC_FILE:
598 		return __io_sqe_files_update(ctx, up, nr_args);
599 	case IORING_RSRC_BUFFER:
600 		return __io_sqe_buffers_update(ctx, up, nr_args);
601 	}
602 	return -EINVAL;
603 }
604 
605 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
606 			     unsigned nr_args)
607 {
608 	struct io_uring_rsrc_update2 up;
609 
610 	if (!nr_args)
611 		return -EINVAL;
612 	memset(&up, 0, sizeof(up));
613 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
614 		return -EFAULT;
615 	if (up.resv || up.resv2)
616 		return -EINVAL;
617 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
618 }
619 
620 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
621 			    unsigned size, unsigned type)
622 {
623 	struct io_uring_rsrc_update2 up;
624 
625 	if (size != sizeof(up))
626 		return -EINVAL;
627 	if (copy_from_user(&up, arg, sizeof(up)))
628 		return -EFAULT;
629 	if (!up.nr || up.resv || up.resv2)
630 		return -EINVAL;
631 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
632 }
633 
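/*
 * Entry point for the "v2" registration opcodes (IORING_REGISTER_FILES2 /
 * IORING_REGISTER_BUFFERS2; the opcode dispatch lives outside this file).
 * As an illustrative, untested sketch, a sparse file table of 64 slots could
 * be set up from userspace with something like:
 *
 *	struct io_uring_rsrc_register rr = {
 *		.nr    = 64,
 *		.flags = IORING_RSRC_REGISTER_SPARSE,
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_FILES2,
 *		&rr, sizeof(rr));
 *
 * With SPARSE set, rr.data must be 0 and the slots are filled in later via
 * the update paths below.
 */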
634 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
635 			    unsigned int size, unsigned int type)
636 {
637 	struct io_uring_rsrc_register rr;
638 
639 	/* keep it extendible */
640 	if (size != sizeof(rr))
641 		return -EINVAL;
642 
643 	memset(&rr, 0, sizeof(rr));
644 	if (copy_from_user(&rr, arg, size))
645 		return -EFAULT;
646 	if (!rr.nr || rr.resv2)
647 		return -EINVAL;
648 	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
649 		return -EINVAL;
650 
651 	switch (type) {
652 	case IORING_RSRC_FILE:
653 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
654 			break;
655 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
656 					     rr.nr, u64_to_user_ptr(rr.tags));
657 	case IORING_RSRC_BUFFER:
658 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
659 			break;
660 		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
661 					       rr.nr, u64_to_user_ptr(rr.tags));
662 	}
663 	return -EINVAL;
664 }
665 
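/*
 * IORING_OP_FILES_UPDATE request side: sqe->addr points at an array of __s32
 * fds, sqe->len is the number of entries and sqe->off is the first slot to
 * update (or IORING_FILE_INDEX_ALLOC to let the kernel pick free slots, see
 * io_files_update_with_index_alloc()).
 */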
666 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
667 {
668 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
669 
670 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
671 		return -EINVAL;
672 	if (sqe->rw_flags || sqe->splice_fd_in)
673 		return -EINVAL;
674 
675 	up->offset = READ_ONCE(sqe->off);
676 	up->nr_args = READ_ONCE(sqe->len);
677 	if (!up->nr_args)
678 		return -EINVAL;
679 	up->arg = READ_ONCE(sqe->addr);
680 	return 0;
681 }
682 
683 static int io_files_update_with_index_alloc(struct io_kiocb *req,
684 					    unsigned int issue_flags)
685 {
686 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
687 	__s32 __user *fds = u64_to_user_ptr(up->arg);
688 	unsigned int done;
689 	struct file *file;
690 	int ret, fd;
691 
692 	if (!req->ctx->file_data)
693 		return -ENXIO;
694 
695 	for (done = 0; done < up->nr_args; done++) {
696 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
697 			ret = -EFAULT;
698 			break;
699 		}
700 
701 		file = fget(fd);
702 		if (!file) {
703 			ret = -EBADF;
704 			break;
705 		}
706 		ret = io_fixed_fd_install(req, issue_flags, file,
707 					  IORING_FILE_INDEX_ALLOC);
708 		if (ret < 0)
709 			break;
710 		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
711 			__io_close_fixed(req, issue_flags, ret);
712 			ret = -EFAULT;
713 			break;
714 		}
715 	}
716 
717 	if (done)
718 		return done;
719 	return ret;
720 }
721 
722 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
723 {
724 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
725 	struct io_ring_ctx *ctx = req->ctx;
726 	struct io_uring_rsrc_update2 up2;
727 	int ret;
728 
729 	up2.offset = up->offset;
730 	up2.data = up->arg;
731 	up2.nr = 0;
732 	up2.tags = 0;
733 	up2.resv = 0;
734 	up2.resv2 = 0;
735 
736 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
737 		ret = io_files_update_with_index_alloc(req, issue_flags);
738 	} else {
739 		io_ring_submit_lock(ctx, issue_flags);
740 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
741 						&up2, up->nr_args);
742 		io_ring_submit_unlock(ctx, issue_flags);
743 	}
744 
745 	if (ret < 0)
746 		req_set_fail(req);
747 	io_req_set_res(req, ret, 0);
748 	return IOU_OK;
749 }
750 
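/*
 * Defer freeing of a replaced resource: the resource and its (now cleared)
 * tag are parked on the given rsrc node and only released once that node's
 * references are gone, via __io_rsrc_put_work().
 */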
751 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
752 			  struct io_rsrc_node *node, void *rsrc)
753 {
754 	u64 *tag_slot = io_get_tag_slot(data, idx);
755 	struct io_rsrc_put *prsrc;
756 
757 	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
758 	if (!prsrc)
759 		return -ENOMEM;
760 
761 	prsrc->tag = *tag_slot;
762 	*tag_slot = 0;
763 	prsrc->rsrc = rsrc;
764 	list_add(&prsrc->list, &node->rsrc_list);
765 	return 0;
766 }
767 
768 void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
769 {
770 #if !defined(IO_URING_SCM_ALL)
771 	int i;
772 
773 	for (i = 0; i < ctx->nr_user_files; i++) {
774 		struct file *file = io_file_from_index(&ctx->file_table, i);
775 
776 		if (!file)
777 			continue;
778 		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
779 			continue;
780 		io_file_bitmap_clear(&ctx->file_table, i);
781 		fput(file);
782 	}
783 #endif
784 
785 #if defined(CONFIG_UNIX)
786 	if (ctx->ring_sock) {
787 		struct sock *sock = ctx->ring_sock->sk;
788 		struct sk_buff *skb;
789 
790 		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
791 			kfree_skb(skb);
792 	}
793 #endif
794 	io_free_file_tables(&ctx->file_table);
795 	io_rsrc_data_free(ctx->file_data);
796 	ctx->file_data = NULL;
797 	ctx->nr_user_files = 0;
798 }
799 
800 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
801 {
802 	unsigned nr = ctx->nr_user_files;
803 	int ret;
804 
805 	if (!ctx->file_data)
806 		return -ENXIO;
807 
808 	/*
809 	 * Quiesce may unlock ->uring_lock, and while it's not held
810 	 * prevent new requests from using the table.
811 	 */
812 	ctx->nr_user_files = 0;
813 	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
814 	ctx->nr_user_files = nr;
815 	if (!ret)
816 		__io_sqe_files_unregister(ctx);
817 	return ret;
818 }
819 
820 /*
821  * Ensure the UNIX gc is aware of our file set, so we are certain that
822  * the io_uring can be safely unregistered on process exit, even if we have
823  * loops in the file referencing. We account only files that can hold other
824  * files because otherwise they can't form a loop and so are not interesting
825  * for GC.
826  */
827 int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
828 {
829 #if defined(CONFIG_UNIX)
830 	struct sock *sk = ctx->ring_sock->sk;
831 	struct sk_buff_head *head = &sk->sk_receive_queue;
832 	struct scm_fp_list *fpl;
833 	struct sk_buff *skb;
834 
835 	if (likely(!io_file_need_scm(file)))
836 		return 0;
837 
838 	/*
839 	 * See if we can merge this file into an existing skb SCM_RIGHTS
840 	 * file set. If there's no room, fall back to allocating a new skb
841 	 * and filling it in.
842 	 */
843 	spin_lock_irq(&head->lock);
844 	skb = skb_peek(head);
845 	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
846 		__skb_unlink(skb, head);
847 	else
848 		skb = NULL;
849 	spin_unlock_irq(&head->lock);
850 
851 	if (!skb) {
852 		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
853 		if (!fpl)
854 			return -ENOMEM;
855 
856 		skb = alloc_skb(0, GFP_KERNEL);
857 		if (!skb) {
858 			kfree(fpl);
859 			return -ENOMEM;
860 		}
861 
862 		fpl->user = get_uid(current_user());
863 		fpl->max = SCM_MAX_FD;
864 		fpl->count = 0;
865 
866 		UNIXCB(skb).fp = fpl;
867 		skb->sk = sk;
868 		skb->destructor = unix_destruct_scm;
869 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
870 	}
871 
872 	fpl = UNIXCB(skb).fp;
873 	fpl->fp[fpl->count++] = get_file(file);
874 	unix_inflight(fpl->user, file);
875 	skb_queue_head(head, skb);
876 	fput(file);
877 #endif
878 	return 0;
879 }
880 
881 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
882 {
883 	struct file *file = prsrc->file;
884 #if defined(CONFIG_UNIX)
885 	struct sock *sock = ctx->ring_sock->sk;
886 	struct sk_buff_head list, *head = &sock->sk_receive_queue;
887 	struct sk_buff *skb;
888 	int i;
889 
890 	if (!io_file_need_scm(file)) {
891 		fput(file);
892 		return;
893 	}
894 
895 	__skb_queue_head_init(&list);
896 
897 	/*
898 	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
899 	 * remove this entry and rearrange the file array.
900 	 */
901 	skb = skb_dequeue(head);
902 	while (skb) {
903 		struct scm_fp_list *fp;
904 
905 		fp = UNIXCB(skb).fp;
906 		for (i = 0; i < fp->count; i++) {
907 			int left;
908 
909 			if (fp->fp[i] != file)
910 				continue;
911 
912 			unix_notinflight(fp->user, fp->fp[i]);
913 			left = fp->count - 1 - i;
914 			if (left) {
915 				memmove(&fp->fp[i], &fp->fp[i + 1],
916 						left * sizeof(struct file *));
917 			}
918 			fp->count--;
919 			if (!fp->count) {
920 				kfree_skb(skb);
921 				skb = NULL;
922 			} else {
923 				__skb_queue_tail(&list, skb);
924 			}
925 			fput(file);
926 			file = NULL;
927 			break;
928 		}
929 
930 		if (!file)
931 			break;
932 
933 		__skb_queue_tail(&list, skb);
934 
935 		skb = skb_dequeue(head);
936 	}
937 
938 	if (skb_peek(&list)) {
939 		spin_lock_irq(&head->lock);
940 		while ((skb = __skb_dequeue(&list)) != NULL)
941 			__skb_queue_tail(head, skb);
942 		spin_unlock_irq(&head->lock);
943 	}
944 #else
945 	fput(file);
946 #endif
947 }
948 
949 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
950 			  unsigned nr_args, u64 __user *tags)
951 {
952 	__s32 __user *fds = (__s32 __user *) arg;
953 	struct file *file;
954 	int fd, ret;
955 	unsigned i;
956 
957 	if (ctx->file_data)
958 		return -EBUSY;
959 	if (!nr_args)
960 		return -EINVAL;
961 	if (nr_args > IORING_MAX_FIXED_FILES)
962 		return -EMFILE;
963 	if (nr_args > rlimit(RLIMIT_NOFILE))
964 		return -EMFILE;
965 	ret = io_rsrc_node_switch_start(ctx);
966 	if (ret)
967 		return ret;
968 	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
969 				 &ctx->file_data);
970 	if (ret)
971 		return ret;
972 
973 	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
974 		io_rsrc_data_free(ctx->file_data);
975 		ctx->file_data = NULL;
976 		return -ENOMEM;
977 	}
978 
979 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
980 		struct io_fixed_file *file_slot;
981 
982 		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
983 			ret = -EFAULT;
984 			goto fail;
985 		}
986 		/* allow sparse sets */
987 		if (!fds || fd == -1) {
988 			ret = -EINVAL;
989 			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
990 				goto fail;
991 			continue;
992 		}
993 
994 		file = fget(fd);
995 		ret = -EBADF;
996 		if (unlikely(!file))
997 			goto fail;
998 
999 		/*
1000 		 * Don't allow io_uring instances to be registered. If UNIX
1001 		 * isn't enabled, then this causes a reference cycle and this
1002 		 * instance can never get freed. If UNIX is enabled we'll
1003 		 * handle it just fine, but there's still no point in allowing
1004 		 * a ring fd as it doesn't support regular read/write anyway.
1005 		 */
1006 		if (io_is_uring_fops(file)) {
1007 			fput(file);
1008 			goto fail;
1009 		}
1010 		ret = io_scm_file_account(ctx, file);
1011 		if (ret) {
1012 			fput(file);
1013 			goto fail;
1014 		}
1015 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
1016 		io_fixed_file_set(file_slot, file);
1017 		io_file_bitmap_set(&ctx->file_table, i);
1018 	}
1019 
1020 	io_rsrc_node_switch(ctx, NULL);
1021 	return 0;
1022 fail:
1023 	__io_sqe_files_unregister(ctx);
1024 	return ret;
1025 }
1026 
1027 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
1028 {
1029 	io_buffer_unmap(ctx, &prsrc->buf);
1030 	prsrc->buf = NULL;
1031 }
1032 
1033 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1034 {
1035 	unsigned int i;
1036 
1037 	for (i = 0; i < ctx->nr_user_bufs; i++)
1038 		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
1039 	kfree(ctx->user_bufs);
1040 	io_rsrc_data_free(ctx->buf_data);
1041 	ctx->user_bufs = NULL;
1042 	ctx->buf_data = NULL;
1043 	ctx->nr_user_bufs = 0;
1044 }
1045 
1046 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1047 {
1048 	unsigned nr = ctx->nr_user_bufs;
1049 	int ret;
1050 
1051 	if (!ctx->buf_data)
1052 		return -ENXIO;
1053 
1054 	/*
1055 	 * Quiesce may unlock ->uring_lock, and while it's not held
1056 	 * prevent new requests from using the table.
1057 	 */
1058 	ctx->nr_user_bufs = 0;
1059 	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
1060 	ctx->nr_user_bufs = nr;
1061 	if (!ret)
1062 		__io_sqe_buffers_unregister(ctx);
1063 	return ret;
1064 }
1065 
1066 /*
1067  * Not super efficient, but this only happens at registration time. And we do
1068  * cache the last compound head, so generally we'll only do a full search if we
1069  * don't match that one.
1070  *
1071  * We check if the given compound head page has already been accounted, to
1072  * avoid double accounting it. This allows us to account the full size of the
1073  * page, not just the constituent pages of a huge page.
1074  */
1075 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
1076 				  int nr_pages, struct page *hpage)
1077 {
1078 	int i, j;
1079 
1080 	/* check current page array */
1081 	for (i = 0; i < nr_pages; i++) {
1082 		if (!PageCompound(pages[i]))
1083 			continue;
1084 		if (compound_head(pages[i]) == hpage)
1085 			return true;
1086 	}
1087 
1088 	/* check previously registered pages */
1089 	for (i = 0; i < ctx->nr_user_bufs; i++) {
1090 		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
1091 
1092 		for (j = 0; j < imu->nr_bvecs; j++) {
1093 			if (!PageCompound(imu->bvec[j].bv_page))
1094 				continue;
1095 			if (compound_head(imu->bvec[j].bv_page) == hpage)
1096 				return true;
1097 		}
1098 	}
1099 
1100 	return false;
1101 }
1102 
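/*
 * Work out how many pages to charge for this buffer.  Normal pages count
 * individually; for compound (huge) pages the head is charged once for the
 * whole page, and only if it wasn't already accounted by an earlier buffer
 * or by an earlier iovec of this one.
 */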
1103 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
1104 				 int nr_pages, struct io_mapped_ubuf *imu,
1105 				 struct page **last_hpage)
1106 {
1107 	int i, ret;
1108 
1109 	imu->acct_pages = 0;
1110 	for (i = 0; i < nr_pages; i++) {
1111 		if (!PageCompound(pages[i])) {
1112 			imu->acct_pages++;
1113 		} else {
1114 			struct page *hpage;
1115 
1116 			hpage = compound_head(pages[i]);
1117 			if (hpage == *last_hpage)
1118 				continue;
1119 			*last_hpage = hpage;
1120 			if (headpage_already_acct(ctx, pages, i, hpage))
1121 				continue;
1122 			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
1123 		}
1124 	}
1125 
1126 	if (!imu->acct_pages)
1127 		return 0;
1128 
1129 	ret = io_account_mem(ctx, imu->acct_pages);
1130 	if (ret)
1131 		imu->acct_pages = 0;
1132 	return ret;
1133 }
1134 
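/*
 * Pin the user buffer [ubuf, ubuf + len) with FOLL_LONGTERM and return the
 * page array (or an ERR_PTR).  File-backed mappings are rejected with
 * -EOPNOTSUPP unless they are shmem or hugetlbfs, since the pages must stay
 * put for as long as the buffer is registered.
 */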
1135 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
1136 {
1137 	unsigned long start, end, nr_pages;
1138 	struct vm_area_struct **vmas = NULL;
1139 	struct page **pages = NULL;
1140 	int i, pret, ret = -ENOMEM;
1141 
1142 	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1143 	start = ubuf >> PAGE_SHIFT;
1144 	nr_pages = end - start;
1145 
1146 	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
1147 	if (!pages)
1148 		goto done;
1149 
1150 	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
1151 			      GFP_KERNEL);
1152 	if (!vmas)
1153 		goto done;
1154 
1155 	ret = 0;
1156 	mmap_read_lock(current->mm);
1157 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
1158 			      pages, vmas);
1159 	if (pret == nr_pages) {
1160 		/* don't support file backed memory */
1161 		for (i = 0; i < nr_pages; i++) {
1162 			struct vm_area_struct *vma = vmas[i];
1163 
1164 			if (vma_is_shmem(vma))
1165 				continue;
1166 			if (vma->vm_file &&
1167 			    !is_file_hugepages(vma->vm_file)) {
1168 				ret = -EOPNOTSUPP;
1169 				break;
1170 			}
1171 		}
1172 		*npages = nr_pages;
1173 	} else {
1174 		ret = pret < 0 ? pret : -EFAULT;
1175 	}
1176 	mmap_read_unlock(current->mm);
1177 	if (ret) {
1178 		/*
1179 		 * if we did a partial map, or found file backed vmas,
1180 		 * release any pages we did get
1181 		 */
1182 		if (pret > 0)
1183 			unpin_user_pages(pages, pret);
1184 		goto done;
1185 	}
1186 	ret = 0;
1187 done:
1188 	kvfree(vmas);
1189 	if (ret < 0) {
1190 		kvfree(pages);
1191 		pages = ERR_PTR(ret);
1192 	}
1193 	return pages;
1194 }
1195 
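/*
 * Pin one registered buffer and describe it as a bio_vec array inside a
 * freshly allocated io_mapped_ubuf.  A NULL iov_base installs ctx->dummy_ubuf
 * instead, which is how sparse buffer slots are represented.
 */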
1196 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
1197 				  struct io_mapped_ubuf **pimu,
1198 				  struct page **last_hpage)
1199 {
1200 	struct io_mapped_ubuf *imu = NULL;
1201 	struct page **pages = NULL;
1202 	unsigned long off;
1203 	size_t size;
1204 	int ret, nr_pages, i;
1205 
1206 	*pimu = ctx->dummy_ubuf;
1207 	if (!iov->iov_base)
1208 		return 0;
1209 
1210 	ret = -ENOMEM;
1211 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
1212 				&nr_pages);
1213 	if (IS_ERR(pages)) {
1214 		ret = PTR_ERR(pages);
1215 		pages = NULL;
1216 		goto done;
1217 	}
1218 
1219 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
1220 	if (!imu)
1221 		goto done;
1222 
1223 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
1224 	if (ret) {
1225 		unpin_user_pages(pages, nr_pages);
1226 		goto done;
1227 	}
1228 
1229 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
1230 	size = iov->iov_len;
1231 	for (i = 0; i < nr_pages; i++) {
1232 		size_t vec_len;
1233 
1234 		vec_len = min_t(size_t, size, PAGE_SIZE - off);
1235 		imu->bvec[i].bv_page = pages[i];
1236 		imu->bvec[i].bv_len = vec_len;
1237 		imu->bvec[i].bv_offset = off;
1238 		off = 0;
1239 		size -= vec_len;
1240 	}
1241 	/* store original address for later verification */
1242 	imu->ubuf = (unsigned long) iov->iov_base;
1243 	imu->ubuf_end = imu->ubuf + iov->iov_len;
1244 	imu->nr_bvecs = nr_pages;
1245 	*pimu = imu;
1246 	ret = 0;
1247 done:
1248 	if (ret)
1249 		kvfree(imu);
1250 	kvfree(pages);
1251 	return ret;
1252 }
1253 
1254 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1255 {
1256 	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1257 	return ctx->user_bufs ? 0 : -ENOMEM;
1258 }
1259 
1260 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
1261 			    unsigned int nr_args, u64 __user *tags)
1262 {
1263 	struct page *last_hpage = NULL;
1264 	struct io_rsrc_data *data;
1265 	int i, ret;
1266 	struct iovec iov;
1267 
1268 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
1269 
1270 	if (ctx->user_bufs)
1271 		return -EBUSY;
1272 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
1273 		return -EINVAL;
1274 	ret = io_rsrc_node_switch_start(ctx);
1275 	if (ret)
1276 		return ret;
1277 	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
1278 	if (ret)
1279 		return ret;
1280 	ret = io_buffers_map_alloc(ctx, nr_args);
1281 	if (ret) {
1282 		io_rsrc_data_free(data);
1283 		return ret;
1284 	}
1285 
1286 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
1287 		if (arg) {
1288 			ret = io_copy_iov(ctx, &iov, arg, i);
1289 			if (ret)
1290 				break;
1291 			ret = io_buffer_validate(&iov);
1292 			if (ret)
1293 				break;
1294 		} else {
1295 			memset(&iov, 0, sizeof(iov));
1296 		}
1297 
1298 		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
1299 			ret = -EINVAL;
1300 			break;
1301 		}
1302 
1303 		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
1304 					     &last_hpage);
1305 		if (ret)
1306 			break;
1307 	}
1308 
1309 	WARN_ON_ONCE(ctx->buf_data);
1310 
1311 	ctx->buf_data = data;
1312 	if (ret)
1313 		__io_sqe_buffers_unregister(ctx);
1314 	else
1315 		io_rsrc_node_switch(ctx, NULL);
1316 	return ret;
1317 }
1318