xref: /openbmc/linux/io_uring/rsrc.c (revision c059f785)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/kernel.h>
3 #include <linux/errno.h>
4 #include <linux/fs.h>
5 #include <linux/file.h>
6 #include <linux/mm.h>
7 #include <linux/slab.h>
8 #include <linux/nospec.h>
9 #include <linux/hugetlb.h>
10 #include <linux/compat.h>
11 #include <linux/io_uring.h>
12 
13 #include <uapi/linux/io_uring.h>
14 
15 #include "io_uring.h"
16 #include "openclose.h"
17 #include "rsrc.h"
18 
19 struct io_rsrc_update {
20 	struct file			*file;
21 	u64				arg;
22 	u32				nr_args;
23 	u32				offset;
24 };
25 
26 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
27 				  struct io_mapped_ubuf **pimu,
28 				  struct page **last_hpage);
29 
30 #define IO_RSRC_REF_BATCH	100
31 
32 /* only define max */
33 #define IORING_MAX_FIXED_FILES	(1U << 20)
34 #define IORING_MAX_REG_BUFFERS	(1U << 14)
35 
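/*
 * The ctx caches a batch of references to the current rsrc node in
 * ctx->rsrc_cached_refs (topped up IO_RSRC_REF_BATCH at a time by
 * io_rsrc_refs_refill() below), so per-request users can take a ref
 * without touching the percpu counter each time. Drop any refs still
 * cached back onto the node, e.g. before the node is switched out.
 */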
36 void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
37 	__must_hold(&ctx->uring_lock)
38 {
39 	if (ctx->rsrc_cached_refs) {
40 		io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
41 		ctx->rsrc_cached_refs = 0;
42 	}
43 }
44 
45 static inline void __io_unaccount_mem(struct user_struct *user,
46 				      unsigned long nr_pages)
47 {
48 	atomic_long_sub(nr_pages, &user->locked_vm);
49 }
50 
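/*
 * Charge nr_pages against the user's RLIMIT_MEMLOCK budget. The cmpxchg
 * loop makes the check-and-add atomic without taking a lock: if another
 * task changed locked_vm in the meantime, re-read and retry. Fails with
 * -ENOMEM if the limit would be exceeded.
 */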
51 static inline int __io_account_mem(struct user_struct *user,
52 				   unsigned long nr_pages)
53 {
54 	unsigned long page_limit, cur_pages, new_pages;
55 
56 	/* Don't allow more pages than we can safely lock */
57 	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
58 
59 	do {
60 		cur_pages = atomic_long_read(&user->locked_vm);
61 		new_pages = cur_pages + nr_pages;
62 		if (new_pages > page_limit)
63 			return -ENOMEM;
64 	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
65 					new_pages) != cur_pages);
66 
67 	return 0;
68 }
69 
70 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
71 {
72 	if (ctx->user)
73 		__io_unaccount_mem(ctx->user, nr_pages);
74 
75 	if (ctx->mm_account)
76 		atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
77 }
78 
79 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
80 {
81 	int ret;
82 
83 	if (ctx->user) {
84 		ret = __io_account_mem(ctx->user, nr_pages);
85 		if (ret)
86 			return ret;
87 	}
88 
89 	if (ctx->mm_account)
90 		atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
91 
92 	return 0;
93 }
94 
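/*
 * Copy the index'th iovec of the user array at @arg into @dst, converting
 * from the compat_iovec layout if the ring was set up by a compat task.
 */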
95 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
96 		       void __user *arg, unsigned index)
97 {
98 	struct iovec __user *src;
99 
100 #ifdef CONFIG_COMPAT
101 	if (ctx->compat) {
102 		struct compat_iovec __user *ciovs;
103 		struct compat_iovec ciov;
104 
105 		ciovs = (struct compat_iovec __user *) arg;
106 		if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
107 			return -EFAULT;
108 
109 		dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
110 		dst->iov_len = ciov.iov_len;
111 		return 0;
112 	}
113 #endif
114 	src = (struct iovec __user *) arg;
115 	if (copy_from_user(dst, &src[index], sizeof(*dst)))
116 		return -EFAULT;
117 	return 0;
118 }
119 
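/*
 * Basic sanity checks on one buffer registration entry: a NULL base is
 * only accepted for a zero-length (sparse) slot, the length is capped at
 * 1G, and base plus the page-rounded length must not wrap around.
 */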
120 static int io_buffer_validate(struct iovec *iov)
121 {
122 	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
123 
124 	/*
125 	 * Don't impose further limits on the size and buffer
126 	 * constraints here; we'll return -EINVAL later when the IO
127 	 * is submitted if they are wrong.
128 	 */
129 	if (!iov->iov_base)
130 		return iov->iov_len ? -EFAULT : 0;
131 	if (!iov->iov_len)
132 		return -EFAULT;
133 
134 	/* arbitrary limit, but we need something */
135 	if (iov->iov_len > SZ_1G)
136 		return -EFAULT;
137 
138 	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
139 		return -EOVERFLOW;
140 
141 	return 0;
142 }
143 
144 static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
145 {
146 	struct io_mapped_ubuf *imu = *slot;
147 	unsigned int i;
148 
149 	if (imu != ctx->dummy_ubuf) {
150 		for (i = 0; i < imu->nr_bvecs; i++)
151 			unpin_user_page(imu->bvec[i].bv_page);
152 		if (imu->acct_pages)
153 			io_unaccount_mem(ctx, imu->acct_pages);
154 		kvfree(imu);
155 	}
156 	*slot = NULL;
157 }
158 
159 void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
160 	__must_hold(&ctx->uring_lock)
161 {
162 	ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
163 	percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
164 }
165 
166 static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
167 {
168 	struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
169 	struct io_ring_ctx *ctx = rsrc_data->ctx;
170 	struct io_rsrc_put *prsrc, *tmp;
171 
172 	list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
173 		list_del(&prsrc->list);
174 
175 		if (prsrc->tag) {
176 			if (ctx->flags & IORING_SETUP_IOPOLL) {
177 				mutex_lock(&ctx->uring_lock);
178 				io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
179 				mutex_unlock(&ctx->uring_lock);
180 			} else {
181 				io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
182 			}
183 		}
184 
185 		rsrc_data->do_put(ctx, prsrc);
186 		kfree(prsrc);
187 	}
188 
189 	io_rsrc_node_destroy(ref_node);
190 	if (atomic_dec_and_test(&rsrc_data->refs))
191 		complete(&rsrc_data->done);
192 }
193 
194 void io_rsrc_put_work(struct work_struct *work)
195 {
196 	struct io_ring_ctx *ctx;
197 	struct llist_node *node;
198 
199 	ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
200 	node = llist_del_all(&ctx->rsrc_put_llist);
201 
202 	while (node) {
203 		struct io_rsrc_node *ref_node;
204 		struct llist_node *next = node->next;
205 
206 		ref_node = llist_entry(node, struct io_rsrc_node, llist);
207 		__io_rsrc_put_work(ref_node);
208 		node = next;
209 	}
210 }
211 
212 void io_wait_rsrc_data(struct io_rsrc_data *data)
213 {
214 	if (data && !atomic_dec_and_test(&data->refs))
215 		wait_for_completion(&data->done);
216 }
217 
218 void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
219 {
220 	percpu_ref_exit(&ref_node->refs);
221 	kfree(ref_node);
222 }
223 
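/*
 * percpu_ref release callback, run when the last reference to a rsrc node
 * is dropped. Nodes are recycled strictly in list order: mark this node
 * done, move any leading run of completed nodes onto the put llist and
 * kick the delayed put work (without delay if a quiesce is waiting).
 */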
224 static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
225 {
226 	struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
227 	struct io_ring_ctx *ctx = node->rsrc_data->ctx;
228 	unsigned long flags;
229 	bool first_add = false;
230 	unsigned long delay = HZ;
231 
232 	spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
233 	node->done = true;
234 
235 	/* if we are mid-quiesce then do not delay */
236 	if (node->rsrc_data->quiesce)
237 		delay = 0;
238 
239 	while (!list_empty(&ctx->rsrc_ref_list)) {
240 		node = list_first_entry(&ctx->rsrc_ref_list,
241 					    struct io_rsrc_node, node);
242 		/* recycle ref nodes in order */
243 		if (!node->done)
244 			break;
245 		list_del(&node->node);
246 		first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
247 	}
248 	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
249 
250 	if (first_add)
251 		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
252 }
253 
254 static struct io_rsrc_node *io_rsrc_node_alloc(void)
255 {
256 	struct io_rsrc_node *ref_node;
257 
258 	ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
259 	if (!ref_node)
260 		return NULL;
261 
262 	if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
263 			    0, GFP_KERNEL)) {
264 		kfree(ref_node);
265 		return NULL;
266 	}
267 	INIT_LIST_HEAD(&ref_node->node);
268 	INIT_LIST_HEAD(&ref_node->rsrc_list);
269 	ref_node->done = false;
270 	return ref_node;
271 }
272 
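/*
 * Retire the current rsrc node and install the pre-allocated backup node
 * as the active one. If @data_to_kill is given, the outgoing node is
 * attached to it and its percpu ref is killed, so the resources queued on
 * the node get put (and their tag CQEs posted) once all users are done.
 * Callers must have successfully run io_rsrc_node_switch_start() first.
 */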
273 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
274 			 struct io_rsrc_data *data_to_kill)
275 	__must_hold(&ctx->uring_lock)
276 {
277 	WARN_ON_ONCE(!ctx->rsrc_backup_node);
278 	WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
279 
280 	io_rsrc_refs_drop(ctx);
281 
282 	if (data_to_kill) {
283 		struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
284 
285 		rsrc_node->rsrc_data = data_to_kill;
286 		spin_lock_irq(&ctx->rsrc_ref_lock);
287 		list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
288 		spin_unlock_irq(&ctx->rsrc_ref_lock);
289 
290 		atomic_inc(&data_to_kill->refs);
291 		percpu_ref_kill(&rsrc_node->refs);
292 		ctx->rsrc_node = NULL;
293 	}
294 
295 	if (!ctx->rsrc_node) {
296 		ctx->rsrc_node = ctx->rsrc_backup_node;
297 		ctx->rsrc_backup_node = NULL;
298 	}
299 }
300 
301 int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
302 {
303 	if (ctx->rsrc_backup_node)
304 		return 0;
305 	ctx->rsrc_backup_node = io_rsrc_node_alloc();
306 	return ctx->rsrc_backup_node ? 0 : -ENOMEM;
307 }
308 
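/*
 * Wait for all outstanding references to @data to go away before its table
 * is torn down. This may drop and re-take ->uring_lock, flush the put work
 * and run task work while waiting; a pending signal aborts the wait,
 * leaving the data reference revived, and an error is returned.
 */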
309 __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
310 				      struct io_ring_ctx *ctx)
311 {
312 	int ret;
313 
314 	/* As we may drop ->uring_lock, another task may have started a quiesce */
315 	if (data->quiesce)
316 		return -ENXIO;
317 
318 	data->quiesce = true;
319 	do {
320 		ret = io_rsrc_node_switch_start(ctx);
321 		if (ret)
322 			break;
323 		io_rsrc_node_switch(ctx, data);
324 
325 		/* kill initial ref, already quiesced if zero */
326 		if (atomic_dec_and_test(&data->refs))
327 			break;
328 		mutex_unlock(&ctx->uring_lock);
329 		flush_delayed_work(&ctx->rsrc_put_work);
330 		ret = wait_for_completion_interruptible(&data->done);
331 		if (!ret) {
332 			mutex_lock(&ctx->uring_lock);
333 			if (atomic_read(&data->refs) > 0) {
334 				/*
335 				 * it has been revived by another thread while
336 				 * we were unlocked
337 				 */
338 				mutex_unlock(&ctx->uring_lock);
339 			} else {
340 				break;
341 			}
342 		}
343 
344 		atomic_inc(&data->refs);
345 		/* wait for all work items potentially completing data->done */
346 		flush_delayed_work(&ctx->rsrc_put_work);
347 		reinit_completion(&data->done);
348 
349 		ret = io_run_task_work_sig();
350 		mutex_lock(&ctx->uring_lock);
351 	} while (ret >= 0);
352 	data->quiesce = false;
353 
354 	return ret;
355 }
356 
357 static void io_free_page_table(void **table, size_t size)
358 {
359 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
360 
361 	for (i = 0; i < nr_tables; i++)
362 		kfree(table[i]);
363 	kfree(table);
364 }
365 
366 static void io_rsrc_data_free(struct io_rsrc_data *data)
367 {
368 	size_t size = data->nr * sizeof(data->tags[0][0]);
369 
370 	if (data->tags)
371 		io_free_page_table((void **)data->tags, size);
372 	kfree(data);
373 }
374 
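/*
 * Allocate a @size byte table as a two-level structure: an array of
 * pointers to page-sized (or smaller, for the tail) chunks. This is used
 * for the rsrc tag array, so large registrations don't need one big
 * contiguous allocation.
 */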
375 static __cold void **io_alloc_page_table(size_t size)
376 {
377 	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
378 	size_t init_size = size;
379 	void **table;
380 
381 	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
382 	if (!table)
383 		return NULL;
384 
385 	for (i = 0; i < nr_tables; i++) {
386 		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
387 
388 		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
389 		if (!table[i]) {
390 			io_free_page_table(table, init_size);
391 			return NULL;
392 		}
393 		size -= this_size;
394 	}
395 	return table;
396 }
397 
398 __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
399 				     rsrc_put_fn *do_put, u64 __user *utags,
400 				     unsigned nr, struct io_rsrc_data **pdata)
401 {
402 	struct io_rsrc_data *data;
403 	int ret = -ENOMEM;
404 	unsigned i;
405 
406 	data = kzalloc(sizeof(*data), GFP_KERNEL);
407 	if (!data)
408 		return -ENOMEM;
409 	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
410 	if (!data->tags) {
411 		kfree(data);
412 		return -ENOMEM;
413 	}
414 
415 	data->nr = nr;
416 	data->ctx = ctx;
417 	data->do_put = do_put;
418 	if (utags) {
419 		ret = -EFAULT;
420 		for (i = 0; i < nr; i++) {
421 			u64 *tag_slot = io_get_tag_slot(data, i);
422 
423 			if (copy_from_user(tag_slot, &utags[i],
424 					   sizeof(*tag_slot)))
425 				goto fail;
426 		}
427 	}
428 
429 	atomic_set(&data->refs, 1);
430 	init_completion(&data->done);
431 	*pdata = data;
432 	return 0;
433 fail:
434 	io_rsrc_data_free(data);
435 	return ret;
436 }
437 
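/*
 * Apply a files update to the registered table: for each entry, queue
 * removal of whatever file currently occupies the slot (it is put once the
 * rsrc node quiesces), then install the new file and tag unless the fd is
 * -1 or the skip sentinel. Returns the number of entries processed, or an
 * error if the very first entry failed.
 */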
438 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
439 				 struct io_uring_rsrc_update2 *up,
440 				 unsigned nr_args)
441 {
442 	u64 __user *tags = u64_to_user_ptr(up->tags);
443 	__s32 __user *fds = u64_to_user_ptr(up->data);
444 	struct io_rsrc_data *data = ctx->file_data;
445 	struct io_fixed_file *file_slot;
446 	struct file *file;
447 	int fd, i, err = 0;
448 	unsigned int done;
449 	bool needs_switch = false;
450 
451 	if (!ctx->file_data)
452 		return -ENXIO;
453 	if (up->offset + nr_args > ctx->nr_user_files)
454 		return -EINVAL;
455 
456 	for (done = 0; done < nr_args; done++) {
457 		u64 tag = 0;
458 
459 		if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
460 		    copy_from_user(&fd, &fds[done], sizeof(fd))) {
461 			err = -EFAULT;
462 			break;
463 		}
464 		if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
465 			err = -EINVAL;
466 			break;
467 		}
468 		if (fd == IORING_REGISTER_FILES_SKIP)
469 			continue;
470 
471 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
472 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
473 
474 		if (file_slot->file_ptr) {
475 			file = (struct file *)(file_slot->file_ptr & FFS_MASK);
476 			err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
477 			if (err)
478 				break;
479 			file_slot->file_ptr = 0;
480 			io_file_bitmap_clear(&ctx->file_table, i);
481 			needs_switch = true;
482 		}
483 		if (fd != -1) {
484 			file = fget(fd);
485 			if (!file) {
486 				err = -EBADF;
487 				break;
488 			}
489 			/*
490 			 * Don't allow io_uring instances to be registered. If
491 			 * UNIX isn't enabled, then this causes a reference
492 			 * cycle and this instance can never get freed. If UNIX
493 			 * is enabled we'll handle it just fine, but there's
494 			 * still no point in allowing a ring fd as it doesn't
495 			 * support regular read/write anyway.
496 			 */
497 			if (io_is_uring_fops(file)) {
498 				fput(file);
499 				err = -EBADF;
500 				break;
501 			}
502 			err = io_scm_file_account(ctx, file);
503 			if (err) {
504 				fput(file);
505 				break;
506 			}
507 			*io_get_tag_slot(data, i) = tag;
508 			io_fixed_file_set(file_slot, file);
509 			io_file_bitmap_set(&ctx->file_table, i);
510 		}
511 	}
512 
513 	if (needs_switch)
514 		io_rsrc_node_switch(ctx, data);
515 	return done ? done : err;
516 }
517 
518 static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
519 				   struct io_uring_rsrc_update2 *up,
520 				   unsigned int nr_args)
521 {
522 	u64 __user *tags = u64_to_user_ptr(up->tags);
523 	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
524 	struct page *last_hpage = NULL;
525 	bool needs_switch = false;
526 	__u32 done;
527 	int i, err;
528 
529 	if (!ctx->buf_data)
530 		return -ENXIO;
531 	if (up->offset + nr_args > ctx->nr_user_bufs)
532 		return -EINVAL;
533 
534 	for (done = 0; done < nr_args; done++) {
535 		struct io_mapped_ubuf *imu;
536 		int offset = up->offset + done;
537 		u64 tag = 0;
538 
539 		err = io_copy_iov(ctx, &iov, iovs, done);
540 		if (err)
541 			break;
542 		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
543 			err = -EFAULT;
544 			break;
545 		}
546 		err = io_buffer_validate(&iov);
547 		if (err)
548 			break;
549 		if (!iov.iov_base && tag) {
550 			err = -EINVAL;
551 			break;
552 		}
553 		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
554 		if (err)
555 			break;
556 
557 		i = array_index_nospec(offset, ctx->nr_user_bufs);
558 		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
559 			err = io_queue_rsrc_removal(ctx->buf_data, i,
560 						    ctx->rsrc_node, ctx->user_bufs[i]);
561 			if (unlikely(err)) {
562 				io_buffer_unmap(ctx, &imu);
563 				break;
564 			}
565 			ctx->user_bufs[i] = ctx->dummy_ubuf;
566 			needs_switch = true;
567 		}
568 
569 		ctx->user_bufs[i] = imu;
570 		*io_get_tag_slot(ctx->buf_data, offset) = tag;
571 	}
572 
573 	if (needs_switch)
574 		io_rsrc_node_switch(ctx, ctx->buf_data);
575 	return done ? done : err;
576 }
577 
578 static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
579 				     struct io_uring_rsrc_update2 *up,
580 				     unsigned nr_args)
581 {
582 	__u32 tmp;
583 	int err;
584 
585 	if (check_add_overflow(up->offset, nr_args, &tmp))
586 		return -EOVERFLOW;
587 	err = io_rsrc_node_switch_start(ctx);
588 	if (err)
589 		return err;
590 
591 	switch (type) {
592 	case IORING_RSRC_FILE:
593 		return __io_sqe_files_update(ctx, up, nr_args);
594 	case IORING_RSRC_BUFFER:
595 		return __io_sqe_buffers_update(ctx, up, nr_args);
596 	}
597 	return -EINVAL;
598 }
599 
600 int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
601 			     unsigned nr_args)
602 {
603 	struct io_uring_rsrc_update2 up;
604 
605 	if (!nr_args)
606 		return -EINVAL;
607 	memset(&up, 0, sizeof(up));
608 	if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
609 		return -EFAULT;
610 	if (up.resv || up.resv2)
611 		return -EINVAL;
612 	return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
613 }
614 
615 int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
616 			    unsigned size, unsigned type)
617 {
618 	struct io_uring_rsrc_update2 up;
619 
620 	if (size != sizeof(up))
621 		return -EINVAL;
622 	if (copy_from_user(&up, arg, sizeof(up)))
623 		return -EFAULT;
624 	if (!up.nr || up.resv || up.resv2)
625 		return -EINVAL;
626 	return __io_register_rsrc_update(ctx, type, &up, up.nr);
627 }
628 
629 __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
630 			    unsigned int size, unsigned int type)
631 {
632 	struct io_uring_rsrc_register rr;
633 
634 	/* keep it extendible */
635 	if (size != sizeof(rr))
636 		return -EINVAL;
637 
638 	memset(&rr, 0, sizeof(rr));
639 	if (copy_from_user(&rr, arg, size))
640 		return -EFAULT;
641 	if (!rr.nr || rr.resv2)
642 		return -EINVAL;
643 	if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
644 		return -EINVAL;
645 
646 	switch (type) {
647 	case IORING_RSRC_FILE:
648 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
649 			break;
650 		return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
651 					     rr.nr, u64_to_user_ptr(rr.tags));
652 	case IORING_RSRC_BUFFER:
653 		if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
654 			break;
655 		return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
656 					       rr.nr, u64_to_user_ptr(rr.tags));
657 	}
658 	return -EINVAL;
659 }
660 
661 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
662 {
663 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
664 
665 	if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
666 		return -EINVAL;
667 	if (sqe->rw_flags || sqe->splice_fd_in)
668 		return -EINVAL;
669 
670 	up->offset = READ_ONCE(sqe->off);
671 	up->nr_args = READ_ONCE(sqe->len);
672 	if (!up->nr_args)
673 		return -EINVAL;
674 	up->arg = READ_ONCE(sqe->addr);
675 	return 0;
676 }
677 
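/*
 * IORING_FILE_INDEX_ALLOC variant of the files update: rather than using
 * caller-chosen slots, install each fd into a free fixed-file slot and
 * copy the allocated index back to the user array. If that copy-out
 * fails, the just-installed slot is closed again before bailing out.
 */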
678 static int io_files_update_with_index_alloc(struct io_kiocb *req,
679 					    unsigned int issue_flags)
680 {
681 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
682 	__s32 __user *fds = u64_to_user_ptr(up->arg);
683 	unsigned int done;
684 	struct file *file;
685 	int ret, fd;
686 
687 	if (!req->ctx->file_data)
688 		return -ENXIO;
689 
690 	for (done = 0; done < up->nr_args; done++) {
691 		if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
692 			ret = -EFAULT;
693 			break;
694 		}
695 
696 		file = fget(fd);
697 		if (!file) {
698 			ret = -EBADF;
699 			break;
700 		}
701 		ret = io_fixed_fd_install(req, issue_flags, file,
702 					  IORING_FILE_INDEX_ALLOC);
703 		if (ret < 0)
704 			break;
705 		if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
706 			__io_close_fixed(req, issue_flags, ret);
707 			ret = -EFAULT;
708 			break;
709 		}
710 	}
711 
712 	if (done)
713 		return done;
714 	return ret;
715 }
716 
717 int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
718 {
719 	struct io_rsrc_update *up = io_kiocb_to_cmd(req);
720 	struct io_ring_ctx *ctx = req->ctx;
721 	struct io_uring_rsrc_update2 up2;
722 	int ret;
723 
724 	up2.offset = up->offset;
725 	up2.data = up->arg;
726 	up2.nr = 0;
727 	up2.tags = 0;
728 	up2.resv = 0;
729 	up2.resv2 = 0;
730 
731 	if (up->offset == IORING_FILE_INDEX_ALLOC) {
732 		ret = io_files_update_with_index_alloc(req, issue_flags);
733 	} else {
734 		io_ring_submit_lock(ctx, issue_flags);
735 		ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
736 						&up2, up->nr_args);
737 		io_ring_submit_unlock(ctx, issue_flags);
738 	}
739 
740 	if (ret < 0)
741 		req_set_fail(req);
742 	io_req_set_res(req, ret, 0);
743 	return IOU_OK;
744 }
745 
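/*
 * Defer freeing of a registered resource: attach it, together with its
 * user tag, to the given rsrc node. Once the node's references drop to
 * zero, the resource is released through the data's ->do_put callback and
 * a CQE is posted for the tag (if it is non-zero).
 */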
746 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
747 			  struct io_rsrc_node *node, void *rsrc)
748 {
749 	u64 *tag_slot = io_get_tag_slot(data, idx);
750 	struct io_rsrc_put *prsrc;
751 
752 	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
753 	if (!prsrc)
754 		return -ENOMEM;
755 
756 	prsrc->tag = *tag_slot;
757 	*tag_slot = 0;
758 	prsrc->rsrc = rsrc;
759 	list_add(&prsrc->list, &node->rsrc_list);
760 	return 0;
761 }
762 
763 void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
764 {
765 #if !defined(IO_URING_SCM_ALL)
766 	int i;
767 
768 	for (i = 0; i < ctx->nr_user_files; i++) {
769 		struct file *file = io_file_from_index(&ctx->file_table, i);
770 
771 		if (!file)
772 			continue;
773 		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
774 			continue;
775 		io_file_bitmap_clear(&ctx->file_table, i);
776 		fput(file);
777 	}
778 #endif
779 
780 #if defined(CONFIG_UNIX)
781 	if (ctx->ring_sock) {
782 		struct sock *sock = ctx->ring_sock->sk;
783 		struct sk_buff *skb;
784 
785 		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
786 			kfree_skb(skb);
787 	}
788 #endif
789 	io_free_file_tables(&ctx->file_table);
790 	io_rsrc_data_free(ctx->file_data);
791 	ctx->file_data = NULL;
792 	ctx->nr_user_files = 0;
793 }
794 
795 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
796 {
797 	unsigned nr = ctx->nr_user_files;
798 	int ret;
799 
800 	if (!ctx->file_data)
801 		return -ENXIO;
802 
803 	/*
804 	 * Quiesce may unlock ->uring_lock; while it's not held,
805 	 * prevent new requests from using the table.
806 	 */
807 	ctx->nr_user_files = 0;
808 	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
809 	ctx->nr_user_files = nr;
810 	if (!ret)
811 		__io_sqe_files_unregister(ctx);
812 	return ret;
813 }
814 
815 /*
816  * Ensure the UNIX gc is aware of our file set, so we are certain that
817  * the io_uring can be safely unregistered on process exit, even if we have
818  * reference loops between files. We account only files that can hold other
819  * files because otherwise they can't form a loop and so are not interesting
820  * for GC.
821  */
822 int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
823 {
824 #if defined(CONFIG_UNIX)
825 	struct sock *sk = ctx->ring_sock->sk;
826 	struct sk_buff_head *head = &sk->sk_receive_queue;
827 	struct scm_fp_list *fpl;
828 	struct sk_buff *skb;
829 
830 	if (likely(!io_file_need_scm(file)))
831 		return 0;
832 
833 	/*
834 	 * See if we can merge this file into an existing skb SCM_RIGHTS
835 	 * file set. If there's no room, fall back to allocating a new skb
836 	 * and filling it in.
837 	 */
838 	spin_lock_irq(&head->lock);
839 	skb = skb_peek(head);
840 	if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD)
841 		__skb_unlink(skb, head);
842 	else
843 		skb = NULL;
844 	spin_unlock_irq(&head->lock);
845 
846 	if (!skb) {
847 		fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
848 		if (!fpl)
849 			return -ENOMEM;
850 
851 		skb = alloc_skb(0, GFP_KERNEL);
852 		if (!skb) {
853 			kfree(fpl);
854 			return -ENOMEM;
855 		}
856 
857 		fpl->user = get_uid(current_user());
858 		fpl->max = SCM_MAX_FD;
859 		fpl->count = 0;
860 
861 		UNIXCB(skb).fp = fpl;
862 		skb->sk = sk;
863 		skb->destructor = unix_destruct_scm;
864 		refcount_add(skb->truesize, &sk->sk_wmem_alloc);
865 	}
866 
867 	fpl = UNIXCB(skb).fp;
868 	fpl->fp[fpl->count++] = get_file(file);
869 	unix_inflight(fpl->user, file);
870 	skb_queue_head(head, skb);
871 	fput(file);
872 #endif
873 	return 0;
874 }
875 
876 static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
877 {
878 	struct file *file = prsrc->file;
879 #if defined(CONFIG_UNIX)
880 	struct sock *sock = ctx->ring_sock->sk;
881 	struct sk_buff_head list, *head = &sock->sk_receive_queue;
882 	struct sk_buff *skb;
883 	int i;
884 
885 	if (!io_file_need_scm(file)) {
886 		fput(file);
887 		return;
888 	}
889 
890 	__skb_queue_head_init(&list);
891 
892 	/*
893 	 * Find the skb that holds this file in its SCM_RIGHTS. When found,
894 	 * remove this entry and rearrange the file array.
895 	 */
896 	skb = skb_dequeue(head);
897 	while (skb) {
898 		struct scm_fp_list *fp;
899 
900 		fp = UNIXCB(skb).fp;
901 		for (i = 0; i < fp->count; i++) {
902 			int left;
903 
904 			if (fp->fp[i] != file)
905 				continue;
906 
907 			unix_notinflight(fp->user, fp->fp[i]);
908 			left = fp->count - 1 - i;
909 			if (left) {
910 				memmove(&fp->fp[i], &fp->fp[i + 1],
911 						left * sizeof(struct file *));
912 			}
913 			fp->count--;
914 			if (!fp->count) {
915 				kfree_skb(skb);
916 				skb = NULL;
917 			} else {
918 				__skb_queue_tail(&list, skb);
919 			}
920 			fput(file);
921 			file = NULL;
922 			break;
923 		}
924 
925 		if (!file)
926 			break;
927 
928 		__skb_queue_tail(&list, skb);
929 
930 		skb = skb_dequeue(head);
931 	}
932 
933 	if (skb_peek(&list)) {
934 		spin_lock_irq(&head->lock);
935 		while ((skb = __skb_dequeue(&list)) != NULL)
936 			__skb_queue_tail(head, skb);
937 		spin_unlock_irq(&head->lock);
938 	}
939 #else
940 	fput(file);
941 #endif
942 }
943 
944 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
945 			  unsigned nr_args, u64 __user *tags)
946 {
947 	__s32 __user *fds = (__s32 __user *) arg;
948 	struct file *file;
949 	int fd, ret;
950 	unsigned i;
951 
952 	if (ctx->file_data)
953 		return -EBUSY;
954 	if (!nr_args)
955 		return -EINVAL;
956 	if (nr_args > IORING_MAX_FIXED_FILES)
957 		return -EMFILE;
958 	if (nr_args > rlimit(RLIMIT_NOFILE))
959 		return -EMFILE;
960 	ret = io_rsrc_node_switch_start(ctx);
961 	if (ret)
962 		return ret;
963 	ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
964 				 &ctx->file_data);
965 	if (ret)
966 		return ret;
967 
968 	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
969 		io_rsrc_data_free(ctx->file_data);
970 		ctx->file_data = NULL;
971 		return -ENOMEM;
972 	}
973 
974 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
975 		struct io_fixed_file *file_slot;
976 
977 		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
978 			ret = -EFAULT;
979 			goto fail;
980 		}
981 		/* allow sparse sets */
982 		if (!fds || fd == -1) {
983 			ret = -EINVAL;
984 			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
985 				goto fail;
986 			continue;
987 		}
988 
989 		file = fget(fd);
990 		ret = -EBADF;
991 		if (unlikely(!file))
992 			goto fail;
993 
994 		/*
995 		 * Don't allow io_uring instances to be registered. If UNIX
996 		 * isn't enabled, then this causes a reference cycle and this
997 		 * instance can never get freed. If UNIX is enabled we'll
998 		 * handle it just fine, but there's still no point in allowing
999 		 * a ring fd as it doesn't support regular read/write anyway.
1000 		 */
1001 		if (io_is_uring_fops(file)) {
1002 			fput(file);
1003 			goto fail;
1004 		}
1005 		ret = io_scm_file_account(ctx, file);
1006 		if (ret) {
1007 			fput(file);
1008 			goto fail;
1009 		}
1010 		file_slot = io_fixed_file_slot(&ctx->file_table, i);
1011 		io_fixed_file_set(file_slot, file);
1012 		io_file_bitmap_set(&ctx->file_table, i);
1013 	}
1014 
1015 	io_rsrc_node_switch(ctx, NULL);
1016 	return 0;
1017 fail:
1018 	__io_sqe_files_unregister(ctx);
1019 	return ret;
1020 }
1021 
1022 static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
1023 {
1024 	io_buffer_unmap(ctx, &prsrc->buf);
1025 	prsrc->buf = NULL;
1026 }
1027 
1028 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1029 {
1030 	unsigned int i;
1031 
1032 	for (i = 0; i < ctx->nr_user_bufs; i++)
1033 		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
1034 	kfree(ctx->user_bufs);
1035 	io_rsrc_data_free(ctx->buf_data);
1036 	ctx->user_bufs = NULL;
1037 	ctx->buf_data = NULL;
1038 	ctx->nr_user_bufs = 0;
1039 }
1040 
1041 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
1042 {
1043 	unsigned nr = ctx->nr_user_bufs;
1044 	int ret;
1045 
1046 	if (!ctx->buf_data)
1047 		return -ENXIO;
1048 
1049 	/*
1050 	 * Quiesce may unlock ->uring_lock; while it's not held,
1051 	 * prevent new requests from using the table.
1052 	 */
1053 	ctx->nr_user_bufs = 0;
1054 	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
1055 	ctx->nr_user_bufs = nr;
1056 	if (!ret)
1057 		__io_sqe_buffers_unregister(ctx);
1058 	return ret;
1059 }
1060 
1061 /*
1062  * Not super efficient, but this only runs at registration time. And we do
1063  * cache the last compound head, so generally we'll only do a full search
1064  * if we don't match that one.
1065  *
1066  * We check if the given compound head page has already been accounted, to
1067  * avoid double accounting it. This allows us to account the full size of a
1068  * compound (huge) page, not just the constituent pages the buffer covers.
1069  */
1070 static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
1071 				  int nr_pages, struct page *hpage)
1072 {
1073 	int i, j;
1074 
1075 	/* check current page array */
1076 	for (i = 0; i < nr_pages; i++) {
1077 		if (!PageCompound(pages[i]))
1078 			continue;
1079 		if (compound_head(pages[i]) == hpage)
1080 			return true;
1081 	}
1082 
1083 	/* check previously registered pages */
1084 	for (i = 0; i < ctx->nr_user_bufs; i++) {
1085 		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
1086 
1087 		for (j = 0; j < imu->nr_bvecs; j++) {
1088 			if (!PageCompound(imu->bvec[j].bv_page))
1089 				continue;
1090 			if (compound_head(imu->bvec[j].bv_page) == hpage)
1091 				return true;
1092 		}
1093 	}
1094 
1095 	return false;
1096 }
1097 
1098 static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
1099 				 int nr_pages, struct io_mapped_ubuf *imu,
1100 				 struct page **last_hpage)
1101 {
1102 	int i, ret;
1103 
1104 	imu->acct_pages = 0;
1105 	for (i = 0; i < nr_pages; i++) {
1106 		if (!PageCompound(pages[i])) {
1107 			imu->acct_pages++;
1108 		} else {
1109 			struct page *hpage;
1110 
1111 			hpage = compound_head(pages[i]);
1112 			if (hpage == *last_hpage)
1113 				continue;
1114 			*last_hpage = hpage;
1115 			if (headpage_already_acct(ctx, pages, i, hpage))
1116 				continue;
1117 			imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
1118 		}
1119 	}
1120 
1121 	if (!imu->acct_pages)
1122 		return 0;
1123 
1124 	ret = io_account_mem(ctx, imu->acct_pages);
1125 	if (ret)
1126 		imu->acct_pages = 0;
1127 	return ret;
1128 }
1129 
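/*
 * Pin the user pages backing [ubuf, ubuf + len) with FOLL_LONGTERM for use
 * as a registered buffer. File-backed mappings are rejected (shmem and
 * hugetlb files are allowed); returns the kvmalloc'ed page array with the
 * page count in *npages, or an ERR_PTR on failure.
 */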
1130 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
1131 {
1132 	unsigned long start, end, nr_pages;
1133 	struct vm_area_struct **vmas = NULL;
1134 	struct page **pages = NULL;
1135 	int i, pret, ret = -ENOMEM;
1136 
1137 	end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1138 	start = ubuf >> PAGE_SHIFT;
1139 	nr_pages = end - start;
1140 
1141 	pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
1142 	if (!pages)
1143 		goto done;
1144 
1145 	vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *),
1146 			      GFP_KERNEL);
1147 	if (!vmas)
1148 		goto done;
1149 
1150 	ret = 0;
1151 	mmap_read_lock(current->mm);
1152 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
1153 			      pages, vmas);
1154 	if (pret == nr_pages) {
1155 		/* don't support file backed memory */
1156 		for (i = 0; i < nr_pages; i++) {
1157 			struct vm_area_struct *vma = vmas[i];
1158 
1159 			if (vma_is_shmem(vma))
1160 				continue;
1161 			if (vma->vm_file &&
1162 			    !is_file_hugepages(vma->vm_file)) {
1163 				ret = -EOPNOTSUPP;
1164 				break;
1165 			}
1166 		}
1167 		*npages = nr_pages;
1168 	} else {
1169 		ret = pret < 0 ? pret : -EFAULT;
1170 	}
1171 	mmap_read_unlock(current->mm);
1172 	if (ret) {
1173 		/*
1174 		 * if we did a partial map, or found file-backed vmas,
1175 		 * release any pages we did get
1176 		 */
1177 		if (pret > 0)
1178 			unpin_user_pages(pages, pret);
1179 		goto done;
1180 	}
1181 	ret = 0;
1182 done:
1183 	kvfree(vmas);
1184 	if (ret < 0) {
1185 		kvfree(pages);
1186 		pages = ERR_PTR(ret);
1187 	}
1188 	return pages;
1189 }
1190 
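/*
 * Pin, account and describe one user buffer: build an io_mapped_ubuf whose
 * bvec array covers iov_base..iov_base + iov_len page by page. A NULL
 * iov_base installs the shared dummy_ubuf, i.e. the slot exists but is
 * empty (sparse registration).
 */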
1191 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
1192 				  struct io_mapped_ubuf **pimu,
1193 				  struct page **last_hpage)
1194 {
1195 	struct io_mapped_ubuf *imu = NULL;
1196 	struct page **pages = NULL;
1197 	unsigned long off;
1198 	size_t size;
1199 	int ret, nr_pages, i;
1200 
1201 	*pimu = ctx->dummy_ubuf;
1202 	if (!iov->iov_base)
1203 		return 0;
1204 
1205 	ret = -ENOMEM;
1206 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
1207 				&nr_pages);
1208 	if (IS_ERR(pages)) {
1209 		ret = PTR_ERR(pages);
1210 		pages = NULL;
1211 		goto done;
1212 	}
1213 
1214 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
1215 	if (!imu)
1216 		goto done;
1217 
1218 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
1219 	if (ret) {
1220 		unpin_user_pages(pages, nr_pages);
1221 		goto done;
1222 	}
1223 
1224 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
1225 	size = iov->iov_len;
1226 	for (i = 0; i < nr_pages; i++) {
1227 		size_t vec_len;
1228 
1229 		vec_len = min_t(size_t, size, PAGE_SIZE - off);
1230 		imu->bvec[i].bv_page = pages[i];
1231 		imu->bvec[i].bv_len = vec_len;
1232 		imu->bvec[i].bv_offset = off;
1233 		off = 0;
1234 		size -= vec_len;
1235 	}
1236 	/* store original address for later verification */
1237 	imu->ubuf = (unsigned long) iov->iov_base;
1238 	imu->ubuf_end = imu->ubuf + iov->iov_len;
1239 	imu->nr_bvecs = nr_pages;
1240 	*pimu = imu;
1241 	ret = 0;
1242 done:
1243 	if (ret)
1244 		kvfree(imu);
1245 	kvfree(pages);
1246 	return ret;
1247 }
1248 
1249 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
1250 {
1251 	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
1252 	return ctx->user_bufs ? 0 : -ENOMEM;
1253 }
1254 
1255 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
1256 			    unsigned int nr_args, u64 __user *tags)
1257 {
1258 	struct page *last_hpage = NULL;
1259 	struct io_rsrc_data *data;
1260 	int i, ret;
1261 	struct iovec iov;
1262 
1263 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
1264 
1265 	if (ctx->user_bufs)
1266 		return -EBUSY;
1267 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
1268 		return -EINVAL;
1269 	ret = io_rsrc_node_switch_start(ctx);
1270 	if (ret)
1271 		return ret;
1272 	ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
1273 	if (ret)
1274 		return ret;
1275 	ret = io_buffers_map_alloc(ctx, nr_args);
1276 	if (ret) {
1277 		io_rsrc_data_free(data);
1278 		return ret;
1279 	}
1280 
1281 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
1282 		if (arg) {
1283 			ret = io_copy_iov(ctx, &iov, arg, i);
1284 			if (ret)
1285 				break;
1286 			ret = io_buffer_validate(&iov);
1287 			if (ret)
1288 				break;
1289 		} else {
1290 			memset(&iov, 0, sizeof(iov));
1291 		}
1292 
1293 		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
1294 			ret = -EINVAL;
1295 			break;
1296 		}
1297 
1298 		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
1299 					     &last_hpage);
1300 		if (ret)
1301 			break;
1302 	}
1303 
1304 	WARN_ON_ONCE(ctx->buf_data);
1305 
1306 	ctx->buf_data = data;
1307 	if (ret)
1308 		__io_sqe_buffers_unregister(ctx);
1309 	else
1310 		io_rsrc_node_switch(ctx, NULL);
1311 	return ret;
1312 }
1313 
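/*
 * Set up @iter over registered buffer @imu for [buf_addr, buf_addr + len),
 * after checking that the range lies fully inside the mapped region. When
 * buf_addr is not at the start of the buffer, the bvec array is fast
 * forwarded by index arithmetic rather than iov_iter_advance(); see the
 * comment below. As a rough illustration (assuming 4K pages, a page-aligned
 * buffer start and a buffer larger than the offset, not a case taken from
 * the source): for an offset of 3 pages, seg_skip works out to 3 (one for
 * the first bvec plus two whole pages) and iov_offset to 0.
 */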
1314 int io_import_fixed(int ddir, struct iov_iter *iter,
1315 			   struct io_mapped_ubuf *imu,
1316 			   u64 buf_addr, size_t len)
1317 {
1318 	u64 buf_end;
1319 	size_t offset;
1320 
1321 	if (WARN_ON_ONCE(!imu))
1322 		return -EFAULT;
1323 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
1324 		return -EFAULT;
1325 	/* not inside the mapped region */
1326 	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
1327 		return -EFAULT;
1328 
1329 	/*
1330 	 * The address may not be at the start of the buffer; set the size
1331 	 * appropriately and advance us to the beginning.
1332 	 */
1333 	offset = buf_addr - imu->ubuf;
1334 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
1335 
1336 	if (offset) {
1337 		/*
1338 		 * Don't use iov_iter_advance() here, as it's really slow when
1339 		 * using the latter parts of a big fixed buffer - it iterates
1340 		 * over each segment manually. We can cheat a bit here, because
1341 		 * we know that:
1342 		 *
1343 		 * 1) it's a BVEC iter, we set it up
1344 		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
1345 		 *    first and last bvec
1346 		 *
1347 		 * So just find our index, and adjust the iterator afterwards.
1348 		 * If the offset is within the first bvec (or is the whole
1349 		 * first bvec), just use iov_iter_advance(). This makes it
1350 		 * easier since we can just skip the first segment, which may
1351 		 * not be PAGE_SIZE aligned.
1352 		 */
1353 		const struct bio_vec *bvec = imu->bvec;
1354 
1355 		if (offset <= bvec->bv_len) {
1356 			iov_iter_advance(iter, offset);
1357 		} else {
1358 			unsigned long seg_skip;
1359 
1360 			/* skip first vec */
1361 			offset -= bvec->bv_len;
1362 			seg_skip = 1 + (offset >> PAGE_SHIFT);
1363 
1364 			iter->bvec = bvec + seg_skip;
1365 			iter->nr_segs -= seg_skip;
1366 			iter->count -= bvec->bv_len + offset;
1367 			iter->iov_offset = offset & ~PAGE_MASK;
1368 		}
1369 	}
1370 
1371 	return 0;
1372 }
1373