xref: /openbmc/linux/lib/iov_iter.c (revision f5da8354)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/highmem.h>
9 #include <linux/slab.h>
10 #include <linux/vmalloc.h>
11 #include <linux/splice.h>
12 #include <linux/compat.h>
13 #include <net/checksum.h>
14 #include <linux/scatterlist.h>
15 #include <linux/instrumented.h>
16 
17 #define PIPE_PARANOIA /* for now */
18 
19 #define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
20 	size_t left;					\
21 	size_t wanted = n;				\
22 	__p = i->iov;					\
23 	__v.iov_len = min(n, __p->iov_len - skip);	\
24 	if (likely(__v.iov_len)) {			\
25 		__v.iov_base = __p->iov_base + skip;	\
26 		left = (STEP);				\
27 		__v.iov_len -= left;			\
28 		skip += __v.iov_len;			\
29 		n -= __v.iov_len;			\
30 	} else {					\
31 		left = 0;				\
32 	}						\
33 	while (unlikely(!left && n)) {			\
34 		__p++;					\
35 		__v.iov_len = min(n, __p->iov_len);	\
36 		if (unlikely(!__v.iov_len))		\
37 			continue;			\
38 		__v.iov_base = __p->iov_base;		\
39 		left = (STEP);				\
40 		__v.iov_len -= left;			\
41 		skip = __v.iov_len;			\
42 		n -= __v.iov_len;			\
43 	}						\
44 	n = wanted - n;					\
45 }
46 
47 #define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
48 	size_t wanted = n;				\
49 	__p = i->kvec;					\
50 	__v.iov_len = min(n, __p->iov_len - skip);	\
51 	if (likely(__v.iov_len)) {			\
52 		__v.iov_base = __p->iov_base + skip;	\
53 		(void)(STEP);				\
54 		skip += __v.iov_len;			\
55 		n -= __v.iov_len;			\
56 	}						\
57 	while (unlikely(n)) {				\
58 		__p++;					\
59 		__v.iov_len = min(n, __p->iov_len);	\
60 		if (unlikely(!__v.iov_len))		\
61 			continue;			\
62 		__v.iov_base = __p->iov_base;		\
63 		(void)(STEP);				\
64 		skip = __v.iov_len;			\
65 		n -= __v.iov_len;			\
66 	}						\
67 	n = wanted;					\
68 }
69 
70 #define iterate_bvec(i, n, __v, __bi, skip, STEP) {	\
71 	struct bvec_iter __start;			\
72 	__start.bi_size = n;				\
73 	__start.bi_bvec_done = skip;			\
74 	__start.bi_idx = 0;				\
75 	for_each_bvec(__v, i->bvec, __bi, __start) {	\
76 		(void)(STEP);				\
77 	}						\
78 }
79 
80 #define iterate_xarray(i, n, __v, skip, STEP) {		\
81 	struct page *head = NULL;				\
82 	size_t wanted = n, seg, offset;				\
83 	loff_t start = i->xarray_start + skip;			\
84 	pgoff_t index = start >> PAGE_SHIFT;			\
85 	int j;							\
86 								\
87 	XA_STATE(xas, i->xarray, index);			\
88 								\
89 	rcu_read_lock();						\
90 	xas_for_each(&xas, head, ULONG_MAX) {				\
91 		if (xas_retry(&xas, head))				\
92 			continue;					\
93 		if (WARN_ON(xa_is_value(head)))				\
94 			break;						\
95 		if (WARN_ON(PageHuge(head)))				\
96 			break;						\
97 		for (j = (head->index < index) ? index - head->index : 0; \
98 		     j < thp_nr_pages(head); j++) {			\
99 			__v.bv_page = head + j;				\
100 			offset = (i->xarray_start + skip) & ~PAGE_MASK;	\
101 			seg = PAGE_SIZE - offset;			\
102 			__v.bv_offset = offset;				\
103 			__v.bv_len = min(n, seg);			\
104 			(void)(STEP);					\
105 			n -= __v.bv_len;				\
106 			skip += __v.bv_len;				\
107 			if (n == 0)					\
108 				break;					\
109 		}							\
110 		if (n == 0)						\
111 			break;						\
112 	}							\
113 	rcu_read_unlock();					\
114 	n = wanted - n;						\
115 }
116 
117 #define iterate_and_advance(i, n, v, I, B, K, X) {		\
118 	if (unlikely(i->count < n))				\
119 		n = i->count;					\
120 	if (likely(n)) {					\
121 		size_t skip = i->iov_offset;			\
122 		if (likely(iter_is_iovec(i))) {			\
123 			const struct iovec *iov;		\
124 			struct iovec v;				\
125 			iterate_iovec(i, n, v, iov, skip, (I))	\
126 			if (skip == iov->iov_len) {		\
127 				iov++;				\
128 				skip = 0;			\
129 			}					\
130 			i->nr_segs -= iov - i->iov;		\
131 			i->iov = iov;				\
132 		} else if (iov_iter_is_bvec(i)) {		\
133 			const struct bio_vec *bvec = i->bvec;	\
134 			struct bio_vec v;			\
135 			struct bvec_iter __bi;			\
136 			iterate_bvec(i, n, v, __bi, skip, (B))	\
137 			i->bvec = __bvec_iter_bvec(i->bvec, __bi);	\
138 			i->nr_segs -= i->bvec - bvec;		\
139 			skip = __bi.bi_bvec_done;		\
140 		} else if (iov_iter_is_kvec(i)) {		\
141 			const struct kvec *kvec;		\
142 			struct kvec v;				\
143 			iterate_kvec(i, n, v, kvec, skip, (K))	\
144 			if (skip == kvec->iov_len) {		\
145 				kvec++;				\
146 				skip = 0;			\
147 			}					\
148 			i->nr_segs -= kvec - i->kvec;		\
149 			i->kvec = kvec;				\
150 		} else if (iov_iter_is_xarray(i)) {		\
151 			struct bio_vec v;			\
152 			iterate_xarray(i, n, v, skip, (X))	\
153 		}						\
154 		i->count -= n;					\
155 		i->iov_offset = skip;				\
156 	}							\
157 }
158 
159 static int copyout(void __user *to, const void *from, size_t n)
160 {
161 	if (should_fail_usercopy())
162 		return n;
163 	if (access_ok(to, n)) {
164 		instrument_copy_to_user(to, from, n);
165 		n = raw_copy_to_user(to, from, n);
166 	}
167 	return n;
168 }
169 
170 static int copyin(void *to, const void __user *from, size_t n)
171 {
172 	if (should_fail_usercopy())
173 		return n;
174 	if (access_ok(from, n)) {
175 		instrument_copy_from_user(to, from, n);
176 		n = raw_copy_from_user(to, from, n);
177 	}
178 	return n;
179 }
180 
181 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
182 			 struct iov_iter *i)
183 {
184 	size_t skip, copy, left, wanted;
185 	const struct iovec *iov;
186 	char __user *buf;
187 	void *kaddr, *from;
188 
189 	if (unlikely(bytes > i->count))
190 		bytes = i->count;
191 
192 	if (unlikely(!bytes))
193 		return 0;
194 
195 	might_fault();
196 	wanted = bytes;
197 	iov = i->iov;
198 	skip = i->iov_offset;
199 	buf = iov->iov_base + skip;
200 	copy = min(bytes, iov->iov_len - skip);
201 
202 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
203 		kaddr = kmap_atomic(page);
204 		from = kaddr + offset;
205 
206 		/* first chunk, usually the only one */
207 		left = copyout(buf, from, copy);
208 		copy -= left;
209 		skip += copy;
210 		from += copy;
211 		bytes -= copy;
212 
213 		while (unlikely(!left && bytes)) {
214 			iov++;
215 			buf = iov->iov_base;
216 			copy = min(bytes, iov->iov_len);
217 			left = copyout(buf, from, copy);
218 			copy -= left;
219 			skip = copy;
220 			from += copy;
221 			bytes -= copy;
222 		}
223 		if (likely(!bytes)) {
224 			kunmap_atomic(kaddr);
225 			goto done;
226 		}
227 		offset = from - kaddr;
228 		buf += copy;
229 		kunmap_atomic(kaddr);
230 		copy = min(bytes, iov->iov_len - skip);
231 	}
232 	/* Too bad - revert to non-atomic kmap */
233 
234 	kaddr = kmap(page);
235 	from = kaddr + offset;
236 	left = copyout(buf, from, copy);
237 	copy -= left;
238 	skip += copy;
239 	from += copy;
240 	bytes -= copy;
241 	while (unlikely(!left && bytes)) {
242 		iov++;
243 		buf = iov->iov_base;
244 		copy = min(bytes, iov->iov_len);
245 		left = copyout(buf, from, copy);
246 		copy -= left;
247 		skip = copy;
248 		from += copy;
249 		bytes -= copy;
250 	}
251 	kunmap(page);
252 
253 done:
254 	if (skip == iov->iov_len) {
255 		iov++;
256 		skip = 0;
257 	}
258 	i->count -= wanted - bytes;
259 	i->nr_segs -= iov - i->iov;
260 	i->iov = iov;
261 	i->iov_offset = skip;
262 	return wanted - bytes;
263 }
264 
265 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
266 			 struct iov_iter *i)
267 {
268 	size_t skip, copy, left, wanted;
269 	const struct iovec *iov;
270 	char __user *buf;
271 	void *kaddr, *to;
272 
273 	if (unlikely(bytes > i->count))
274 		bytes = i->count;
275 
276 	if (unlikely(!bytes))
277 		return 0;
278 
279 	might_fault();
280 	wanted = bytes;
281 	iov = i->iov;
282 	skip = i->iov_offset;
283 	buf = iov->iov_base + skip;
284 	copy = min(bytes, iov->iov_len - skip);
285 
286 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
287 		kaddr = kmap_atomic(page);
288 		to = kaddr + offset;
289 
290 		/* first chunk, usually the only one */
291 		left = copyin(to, buf, copy);
292 		copy -= left;
293 		skip += copy;
294 		to += copy;
295 		bytes -= copy;
296 
297 		while (unlikely(!left && bytes)) {
298 			iov++;
299 			buf = iov->iov_base;
300 			copy = min(bytes, iov->iov_len);
301 			left = copyin(to, buf, copy);
302 			copy -= left;
303 			skip = copy;
304 			to += copy;
305 			bytes -= copy;
306 		}
307 		if (likely(!bytes)) {
308 			kunmap_atomic(kaddr);
309 			goto done;
310 		}
311 		offset = to - kaddr;
312 		buf += copy;
313 		kunmap_atomic(kaddr);
314 		copy = min(bytes, iov->iov_len - skip);
315 	}
316 	/* Too bad - revert to non-atomic kmap */
317 
318 	kaddr = kmap(page);
319 	to = kaddr + offset;
320 	left = copyin(to, buf, copy);
321 	copy -= left;
322 	skip += copy;
323 	to += copy;
324 	bytes -= copy;
325 	while (unlikely(!left && bytes)) {
326 		iov++;
327 		buf = iov->iov_base;
328 		copy = min(bytes, iov->iov_len);
329 		left = copyin(to, buf, copy);
330 		copy -= left;
331 		skip = copy;
332 		to += copy;
333 		bytes -= copy;
334 	}
335 	kunmap(page);
336 
337 done:
338 	if (skip == iov->iov_len) {
339 		iov++;
340 		skip = 0;
341 	}
342 	i->count -= wanted - bytes;
343 	i->nr_segs -= iov - i->iov;
344 	i->iov = iov;
345 	i->iov_offset = skip;
346 	return wanted - bytes;
347 }
348 
349 #ifdef PIPE_PARANOIA
350 static bool sanity(const struct iov_iter *i)
351 {
352 	struct pipe_inode_info *pipe = i->pipe;
353 	unsigned int p_head = pipe->head;
354 	unsigned int p_tail = pipe->tail;
355 	unsigned int p_mask = pipe->ring_size - 1;
356 	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
357 	unsigned int i_head = i->head;
358 	unsigned int idx;
359 
360 	if (i->iov_offset) {
361 		struct pipe_buffer *p;
362 		if (unlikely(p_occupancy == 0))
363 			goto Bad;	// pipe must be non-empty
364 		if (unlikely(i_head != p_head - 1))
365 			goto Bad;	// must be at the last buffer...
366 
367 		p = &pipe->bufs[i_head & p_mask];
368 		if (unlikely(p->offset + p->len != i->iov_offset))
369 			goto Bad;	// ... at the end of segment
370 	} else {
371 		if (i_head != p_head)
372 			goto Bad;	// must be right after the last buffer
373 	}
374 	return true;
375 Bad:
376 	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
377 	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
378 			p_head, p_tail, pipe->ring_size);
379 	for (idx = 0; idx < pipe->ring_size; idx++)
380 		printk(KERN_ERR "[%p %p %d %d]\n",
381 			pipe->bufs[idx].ops,
382 			pipe->bufs[idx].page,
383 			pipe->bufs[idx].offset,
384 			pipe->bufs[idx].len);
385 	WARN_ON(1);
386 	return false;
387 }
388 #else
389 #define sanity(i) true
390 #endif
391 
392 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
393 			 struct iov_iter *i)
394 {
395 	struct pipe_inode_info *pipe = i->pipe;
396 	struct pipe_buffer *buf;
397 	unsigned int p_tail = pipe->tail;
398 	unsigned int p_mask = pipe->ring_size - 1;
399 	unsigned int i_head = i->head;
400 	size_t off;
401 
402 	if (unlikely(bytes > i->count))
403 		bytes = i->count;
404 
405 	if (unlikely(!bytes))
406 		return 0;
407 
408 	if (!sanity(i))
409 		return 0;
410 
411 	off = i->iov_offset;
412 	buf = &pipe->bufs[i_head & p_mask];
413 	if (off) {
414 		if (offset == off && buf->page == page) {
415 			/* merge with the last one */
416 			buf->len += bytes;
417 			i->iov_offset += bytes;
418 			goto out;
419 		}
420 		i_head++;
421 		buf = &pipe->bufs[i_head & p_mask];
422 	}
423 	if (pipe_full(i_head, p_tail, pipe->max_usage))
424 		return 0;
425 
426 	buf->ops = &page_cache_pipe_buf_ops;
427 	get_page(page);
428 	buf->page = page;
429 	buf->offset = offset;
430 	buf->len = bytes;
431 
432 	pipe->head = i_head + 1;
433 	i->iov_offset = offset + bytes;
434 	i->head = i_head;
435 out:
436 	i->count -= bytes;
437 	return bytes;
438 }
439 
440 /*
441  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
442  * @bytes.  For each iovec, fault in each page that constitutes the iovec.
443  *
444  * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
445  * because it is an invalid address).
446  */
447 int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
448 {
449 	if (iter_is_iovec(i)) {
450 		const struct iovec *p;
451 		size_t skip;
452 
453 		if (bytes > i->count)
454 			bytes = i->count;
455 		for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
456 			size_t len = min(bytes, p->iov_len - skip);
457 			int err;
458 
459 			if (unlikely(!len))
460 				continue;
461 			err = fault_in_pages_readable(p->iov_base + skip, len);
462 			if (unlikely(err))
463 				return err;
464 			bytes -= len;
465 		}
466 	}
467 	return 0;
468 }
469 EXPORT_SYMBOL(iov_iter_fault_in_readable);
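
/*
 * Illustrative sketch, not part of the original file: the usual way a
 * buffered-write style path pairs iov_iter_fault_in_readable() with an
 * atomic copy (modelled loosely on generic_perform_write()).  The source
 * is pre-faulted while no locks are held, then copied with page faults
 * disabled; a zero return from the atomic copy means the source went
 * non-resident again in between, so we simply re-fault and retry.  The
 * destination here is a throwaway scratch page, purely for illustration.
 */
static ssize_t __maybe_unused example_faultin_copy(struct iov_iter *i)
{
	struct page *page = alloc_page(GFP_KERNEL);
	ssize_t done = 0;

	if (!page)
		return -ENOMEM;

	while (iov_iter_count(i)) {
		size_t bytes = min_t(size_t, iov_iter_count(i), PAGE_SIZE);
		size_t copied;

		/* fault the user pages in before taking any fs locks */
		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
			if (!done)
				done = -EFAULT;
			break;
		}
		/* copies and advances @i by the amount actually copied */
		copied = copy_page_from_iter_atomic(page, 0, bytes, i);
		if (unlikely(!copied))
			continue;	/* source went away again; retry */
		done += copied;
	}
	__free_page(page);
	return done;
}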
470 
471 void iov_iter_init(struct iov_iter *i, unsigned int direction,
472 			const struct iovec *iov, unsigned long nr_segs,
473 			size_t count)
474 {
475 	WARN_ON(direction & ~(READ | WRITE));
476 	WARN_ON_ONCE(uaccess_kernel());
477 	*i = (struct iov_iter) {
478 		.iter_type = ITER_IOVEC,
479 		.data_source = direction,
480 		.iov = iov,
481 		.nr_segs = nr_segs,
482 		.iov_offset = 0,
483 		.count = count
484 	};
485 }
486 EXPORT_SYMBOL(iov_iter_init);
487 
488 static inline bool allocated(struct pipe_buffer *buf)
489 {
490 	return buf->ops == &default_pipe_buf_ops;
491 }
492 
493 static inline void data_start(const struct iov_iter *i,
494 			      unsigned int *iter_headp, size_t *offp)
495 {
496 	unsigned int p_mask = i->pipe->ring_size - 1;
497 	unsigned int iter_head = i->head;
498 	size_t off = i->iov_offset;
499 
500 	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
501 		    off == PAGE_SIZE)) {
502 		iter_head++;
503 		off = 0;
504 	}
505 	*iter_headp = iter_head;
506 	*offp = off;
507 }
508 
509 static size_t push_pipe(struct iov_iter *i, size_t size,
510 			int *iter_headp, size_t *offp)
511 {
512 	struct pipe_inode_info *pipe = i->pipe;
513 	unsigned int p_tail = pipe->tail;
514 	unsigned int p_mask = pipe->ring_size - 1;
515 	unsigned int iter_head;
516 	size_t off;
517 	ssize_t left;
518 
519 	if (unlikely(size > i->count))
520 		size = i->count;
521 	if (unlikely(!size))
522 		return 0;
523 
524 	left = size;
525 	data_start(i, &iter_head, &off);
526 	*iter_headp = iter_head;
527 	*offp = off;
528 	if (off) {
529 		left -= PAGE_SIZE - off;
530 		if (left <= 0) {
531 			pipe->bufs[iter_head & p_mask].len += size;
532 			return size;
533 		}
534 		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
535 		iter_head++;
536 	}
537 	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
538 		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
539 		struct page *page = alloc_page(GFP_USER);
540 		if (!page)
541 			break;
542 
543 		buf->ops = &default_pipe_buf_ops;
544 		buf->page = page;
545 		buf->offset = 0;
546 		buf->len = min_t(ssize_t, left, PAGE_SIZE);
547 		left -= buf->len;
548 		iter_head++;
549 		pipe->head = iter_head;
550 
551 		if (left == 0)
552 			return size;
553 	}
554 	return size - left;
555 }
556 
557 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
558 				struct iov_iter *i)
559 {
560 	struct pipe_inode_info *pipe = i->pipe;
561 	unsigned int p_mask = pipe->ring_size - 1;
562 	unsigned int i_head;
563 	size_t n, off;
564 
565 	if (!sanity(i))
566 		return 0;
567 
568 	bytes = n = push_pipe(i, bytes, &i_head, &off);
569 	if (unlikely(!n))
570 		return 0;
571 	do {
572 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
573 		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
574 		i->head = i_head;
575 		i->iov_offset = off + chunk;
576 		n -= chunk;
577 		addr += chunk;
578 		off = 0;
579 		i_head++;
580 	} while (n);
581 	i->count -= bytes;
582 	return bytes;
583 }
584 
585 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
586 			      __wsum sum, size_t off)
587 {
588 	__wsum next = csum_partial_copy_nocheck(from, to, len);
589 	return csum_block_add(sum, next, off);
590 }
591 
592 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
593 					 struct csum_state *csstate,
594 					 struct iov_iter *i)
595 {
596 	struct pipe_inode_info *pipe = i->pipe;
597 	unsigned int p_mask = pipe->ring_size - 1;
598 	__wsum sum = csstate->csum;
599 	size_t off = csstate->off;
600 	unsigned int i_head;
601 	size_t n, r;
602 
603 	if (!sanity(i))
604 		return 0;
605 
606 	bytes = n = push_pipe(i, bytes, &i_head, &r);
607 	if (unlikely(!n))
608 		return 0;
609 	do {
610 		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
611 		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
612 		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
613 		kunmap_atomic(p);
614 		i->head = i_head;
615 		i->iov_offset = r + chunk;
616 		n -= chunk;
617 		off += chunk;
618 		addr += chunk;
619 		r = 0;
620 		i_head++;
621 	} while (n);
622 	i->count -= bytes;
623 	csstate->csum = sum;
624 	csstate->off = off;
625 	return bytes;
626 }
627 
628 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
629 {
630 	const char *from = addr;
631 	if (unlikely(iov_iter_is_pipe(i)))
632 		return copy_pipe_to_iter(addr, bytes, i);
633 	if (iter_is_iovec(i))
634 		might_fault();
635 	iterate_and_advance(i, bytes, v,
636 		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
637 		memcpy_to_page(v.bv_page, v.bv_offset,
638 			       (from += v.bv_len) - v.bv_len, v.bv_len),
639 		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
640 		memcpy_to_page(v.bv_page, v.bv_offset,
641 			       (from += v.bv_len) - v.bv_len, v.bv_len)
642 	)
643 
644 	return bytes;
645 }
646 EXPORT_SYMBOL(_copy_to_iter);
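
/*
 * Illustrative sketch, not part of the original file: emitting a
 * fixed-size record through the copy_to_iter() wrapper from <linux/uio.h>.
 * A short return can only mean that a user-backed segment faulted, in
 * which case the iterator is rewound so the caller sees an all-or-nothing
 * result.  The -ENOBUFS convention for "destination too small" is just
 * this example's choice.
 */
static ssize_t __maybe_unused example_emit_record(const void *rec, size_t len,
						  struct iov_iter *to)
{
	size_t copied;

	if (iov_iter_count(to) < len)
		return -ENOBUFS;

	copied = copy_to_iter(rec, len, to);
	if (copied != len) {
		/* rewind the partial advance so the record stays atomic */
		iov_iter_revert(to, copied);
		return -EFAULT;
	}
	return copied;
}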
647 
648 #ifdef CONFIG_ARCH_HAS_COPY_MC
649 static int copyout_mc(void __user *to, const void *from, size_t n)
650 {
651 	if (access_ok(to, n)) {
652 		instrument_copy_to_user(to, from, n);
653 		n = copy_mc_to_user((__force void *) to, from, n);
654 	}
655 	return n;
656 }
657 
658 static unsigned long copy_mc_to_page(struct page *page, size_t offset,
659 		const char *from, size_t len)
660 {
661 	unsigned long ret;
662 	char *to;
663 
664 	to = kmap_atomic(page);
665 	ret = copy_mc_to_kernel(to + offset, from, len);
666 	kunmap_atomic(to);
667 
668 	return ret;
669 }
670 
671 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
672 				struct iov_iter *i)
673 {
674 	struct pipe_inode_info *pipe = i->pipe;
675 	unsigned int p_mask = pipe->ring_size - 1;
676 	unsigned int i_head;
677 	size_t n, off, xfer = 0;
678 
679 	if (!sanity(i))
680 		return 0;
681 
682 	bytes = n = push_pipe(i, bytes, &i_head, &off);
683 	if (unlikely(!n))
684 		return 0;
685 	do {
686 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
687 		unsigned long rem;
688 
689 		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
690 					    off, addr, chunk);
691 		i->head = i_head;
692 		i->iov_offset = off + chunk - rem;
693 		xfer += chunk - rem;
694 		if (rem)
695 			break;
696 		n -= chunk;
697 		addr += chunk;
698 		off = 0;
699 		i_head++;
700 	} while (n);
701 	i->count -= xfer;
702 	return xfer;
703 }
704 
705 /**
706  * _copy_mc_to_iter - copy to iter with source memory error exception handling
707  * @addr: source kernel address
708  * @bytes: total transfer length
709  * @i: destination iterator
710  *
711  * The pmem driver deploys this for the dax operation
712  * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
713  * block-layer). Upon #MC, read(2) aborts and returns EIO or the bytes
714  * successfully copied.
715  *
716  * The main differences between this and typical _copy_to_iter() are:
717  *
718  * * Typical tail/residue handling after a fault retries the copy
719  *   byte-by-byte until the fault happens again. Re-triggering machine
720  *   checks is potentially fatal so the implementation uses source
721  *   alignment and poison alignment assumptions to avoid re-triggering
722  *   hardware exceptions.
723  *
724  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
725  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
726  *   a short copy.
727  */
728 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
729 {
730 	const char *from = addr;
731 	unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
732 
733 	if (unlikely(iov_iter_is_pipe(i)))
734 		return copy_mc_pipe_to_iter(addr, bytes, i);
735 	if (iter_is_iovec(i))
736 		might_fault();
737 	iterate_and_advance(i, bytes, v,
738 		copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
739 			   v.iov_len),
740 		({
741 		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
742 				      (from += v.bv_len) - v.bv_len, v.bv_len);
743 		if (rem) {
744 			curr_addr = (unsigned long) from;
745 			bytes = curr_addr - s_addr - rem;
746 			return bytes;
747 		}
748 		}),
749 		({
750 		rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
751 					- v.iov_len, v.iov_len);
752 		if (rem) {
753 			curr_addr = (unsigned long) from;
754 			bytes = curr_addr - s_addr - rem;
755 			return bytes;
756 		}
757 		}),
758 		({
759 		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
760 				      (from += v.bv_len) - v.bv_len, v.bv_len);
761 		if (rem) {
762 			curr_addr = (unsigned long) from;
763 			bytes = curr_addr - s_addr - rem;
764 			rcu_read_unlock();
765 			i->iov_offset += bytes;
766 			i->count -= bytes;
767 			return bytes;
768 		}
769 		})
770 	)
771 
772 	return bytes;
773 }
774 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
775 #endif /* CONFIG_ARCH_HAS_COPY_MC */
776 
777 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
778 {
779 	char *to = addr;
780 	if (unlikely(iov_iter_is_pipe(i))) {
781 		WARN_ON(1);
782 		return 0;
783 	}
784 	if (iter_is_iovec(i))
785 		might_fault();
786 	iterate_and_advance(i, bytes, v,
787 		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
788 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
789 				 v.bv_offset, v.bv_len),
790 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
791 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
792 				 v.bv_offset, v.bv_len)
793 	)
794 
795 	return bytes;
796 }
797 EXPORT_SYMBOL(_copy_from_iter);
798 
799 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
800 {
801 	char *to = addr;
802 	if (unlikely(iov_iter_is_pipe(i))) {
803 		WARN_ON(1);
804 		return 0;
805 	}
806 	iterate_and_advance(i, bytes, v,
807 		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
808 					 v.iov_base, v.iov_len),
809 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
810 				 v.bv_offset, v.bv_len),
811 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
812 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
813 				 v.bv_offset, v.bv_len)
814 	)
815 
816 	return bytes;
817 }
818 EXPORT_SYMBOL(_copy_from_iter_nocache);
819 
820 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
821 /**
822  * _copy_from_iter_flushcache - write destination through cpu cache
823  * @addr: destination kernel address
824  * @bytes: total transfer length
825  * @i: source iterator
826  *
827  * The pmem driver arranges for filesystem-dax to use this facility via
828  * dax_copy_from_iter() for ensuring that writes to persistent memory
829  * are flushed through the CPU cache. It is differentiated from
830  * _copy_from_iter_nocache() in that it guarantees all data is flushed for
831  * all iterator types. _copy_from_iter_nocache() only attempts to
832  * bypass the cache for the ITER_IOVEC case, and on some archs may use
833  * instructions that strand dirty data in the cache.
834  */
835 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
836 {
837 	char *to = addr;
838 	if (unlikely(iov_iter_is_pipe(i))) {
839 		WARN_ON(1);
840 		return 0;
841 	}
842 	iterate_and_advance(i, bytes, v,
843 		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
844 					 v.iov_base, v.iov_len),
845 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
846 				 v.bv_offset, v.bv_len),
847 		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
848 			v.iov_len),
849 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
850 				 v.bv_offset, v.bv_len)
851 	)
852 
853 	return bytes;
854 }
855 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
856 #endif
857 
858 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
859 {
860 	struct page *head;
861 	size_t v = n + offset;
862 
863 	/*
864 	 * The general case needs to access the page order to compute
865 	 * the page size.
866 	 * However, we mostly deal with order-0 pages and thus can
867 	 * avoid a possible cache line miss for requests that fit all
868 	 * page orders.
869 	 */
870 	if (n <= v && v <= PAGE_SIZE)
871 		return true;
872 
873 	head = compound_head(page);
874 	v += (page - head) << PAGE_SHIFT;
875 
876 	if (likely(n <= v && v <= (page_size(head))))
877 		return true;
878 	WARN_ON(1);
879 	return false;
880 }
881 
882 static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
883 			 struct iov_iter *i)
884 {
885 	if (likely(iter_is_iovec(i)))
886 		return copy_page_to_iter_iovec(page, offset, bytes, i);
887 	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
888 		void *kaddr = kmap_atomic(page);
889 		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
890 		kunmap_atomic(kaddr);
891 		return wanted;
892 	}
893 	if (iov_iter_is_pipe(i))
894 		return copy_page_to_iter_pipe(page, offset, bytes, i);
895 	if (unlikely(iov_iter_is_discard(i))) {
896 		if (unlikely(i->count < bytes))
897 			bytes = i->count;
898 		i->count -= bytes;
899 		return bytes;
900 	}
901 	WARN_ON(1);
902 	return 0;
903 }
904 
905 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
906 			 struct iov_iter *i)
907 {
908 	size_t res = 0;
909 	if (unlikely(!page_copy_sane(page, offset, bytes)))
910 		return 0;
911 	page += offset / PAGE_SIZE; // first subpage
912 	offset %= PAGE_SIZE;
913 	while (1) {
914 		size_t n = __copy_page_to_iter(page, offset,
915 				min(bytes, (size_t)PAGE_SIZE - offset), i);
916 		res += n;
917 		bytes -= n;
918 		if (!bytes || !n)
919 			break;
920 		offset += n;
921 		if (offset == PAGE_SIZE) {
922 			page++;
923 			offset = 0;
924 		}
925 	}
926 	return res;
927 }
928 EXPORT_SYMBOL(copy_page_to_iter);
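
/*
 * Illustrative sketch, not part of the original file: a read-side helper
 * that pushes part of an up-to-date, referenced page into whatever the
 * caller supplied.  Going through copy_page_to_iter() is what makes the
 * same code work for user iovecs, kvecs, bvecs, pipes and xarray-backed
 * iterators alike; it clamps to the iterator count internally and returns
 * the number of bytes actually copied.
 */
static size_t __maybe_unused example_send_page(struct page *page, loff_t pos,
					       size_t len, struct iov_iter *to)
{
	size_t offset = offset_in_page(pos);
	size_t chunk = min(len, (size_t)(PAGE_SIZE - offset));

	return copy_page_to_iter(page, offset, chunk, to);
}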
929 
930 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
931 			 struct iov_iter *i)
932 {
933 	if (unlikely(!page_copy_sane(page, offset, bytes)))
934 		return 0;
935 	if (likely(iter_is_iovec(i)))
936 		return copy_page_from_iter_iovec(page, offset, bytes, i);
937 	if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
938 		void *kaddr = kmap_atomic(page);
939 		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
940 		kunmap_atomic(kaddr);
941 		return wanted;
942 	}
943 	WARN_ON(1);
944 	return 0;
945 }
946 EXPORT_SYMBOL(copy_page_from_iter);
947 
948 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
949 {
950 	struct pipe_inode_info *pipe = i->pipe;
951 	unsigned int p_mask = pipe->ring_size - 1;
952 	unsigned int i_head;
953 	size_t n, off;
954 
955 	if (!sanity(i))
956 		return 0;
957 
958 	bytes = n = push_pipe(i, bytes, &i_head, &off);
959 	if (unlikely(!n))
960 		return 0;
961 
962 	do {
963 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
964 		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
965 		i->head = i_head;
966 		i->iov_offset = off + chunk;
967 		n -= chunk;
968 		off = 0;
969 		i_head++;
970 	} while (n);
971 	i->count -= bytes;
972 	return bytes;
973 }
974 
975 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
976 {
977 	if (unlikely(iov_iter_is_pipe(i)))
978 		return pipe_zero(bytes, i);
979 	iterate_and_advance(i, bytes, v,
980 		clear_user(v.iov_base, v.iov_len),
981 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
982 		memset(v.iov_base, 0, v.iov_len),
983 		memzero_page(v.bv_page, v.bv_offset, v.bv_len)
984 	)
985 
986 	return bytes;
987 }
988 EXPORT_SYMBOL(iov_iter_zero);
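
/*
 * Illustrative sketch, not part of the original file: a read that runs
 * into a hole (or past the end of the backing store) typically just
 * zero-fills the rest of the caller's buffers.  A short return means a
 * user-backed segment faulted part-way through.
 */
static size_t __maybe_unused example_zero_tail(struct iov_iter *to)
{
	return iov_iter_zero(iov_iter_count(to), to);
}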
989 
990 size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
991 				  struct iov_iter *i)
992 {
993 	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
994 	if (unlikely(!page_copy_sane(page, offset, bytes))) {
995 		kunmap_atomic(kaddr);
996 		return 0;
997 	}
998 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
999 		kunmap_atomic(kaddr);
1000 		WARN_ON(1);
1001 		return 0;
1002 	}
1003 	iterate_and_advance(i, bytes, v,
1004 		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
1005 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
1006 				 v.bv_offset, v.bv_len),
1007 		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
1008 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
1009 				 v.bv_offset, v.bv_len)
1010 	)
1011 	kunmap_atomic(kaddr);
1012 	return bytes;
1013 }
1014 EXPORT_SYMBOL(copy_page_from_iter_atomic);
1015 
1016 static inline void pipe_truncate(struct iov_iter *i)
1017 {
1018 	struct pipe_inode_info *pipe = i->pipe;
1019 	unsigned int p_tail = pipe->tail;
1020 	unsigned int p_head = pipe->head;
1021 	unsigned int p_mask = pipe->ring_size - 1;
1022 
1023 	if (!pipe_empty(p_head, p_tail)) {
1024 		struct pipe_buffer *buf;
1025 		unsigned int i_head = i->head;
1026 		size_t off = i->iov_offset;
1027 
1028 		if (off) {
1029 			buf = &pipe->bufs[i_head & p_mask];
1030 			buf->len = off - buf->offset;
1031 			i_head++;
1032 		}
1033 		while (p_head != i_head) {
1034 			p_head--;
1035 			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
1036 		}
1037 
1038 		pipe->head = p_head;
1039 	}
1040 }
1041 
1042 static void pipe_advance(struct iov_iter *i, size_t size)
1043 {
1044 	struct pipe_inode_info *pipe = i->pipe;
1045 	if (size) {
1046 		struct pipe_buffer *buf;
1047 		unsigned int p_mask = pipe->ring_size - 1;
1048 		unsigned int i_head = i->head;
1049 		size_t off = i->iov_offset, left = size;
1050 
1051 		if (off) /* make it relative to the beginning of buffer */
1052 			left += off - pipe->bufs[i_head & p_mask].offset;
1053 		while (1) {
1054 			buf = &pipe->bufs[i_head & p_mask];
1055 			if (left <= buf->len)
1056 				break;
1057 			left -= buf->len;
1058 			i_head++;
1059 		}
1060 		i->head = i_head;
1061 		i->iov_offset = buf->offset + left;
1062 	}
1063 	i->count -= size;
1064 	/* ... and discard everything past that point */
1065 	pipe_truncate(i);
1066 }
1067 
1068 static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
1069 {
1070 	struct bvec_iter bi;
1071 
1072 	bi.bi_size = i->count;
1073 	bi.bi_bvec_done = i->iov_offset;
1074 	bi.bi_idx = 0;
1075 	bvec_iter_advance(i->bvec, &bi, size);
1076 
1077 	i->bvec += bi.bi_idx;
1078 	i->nr_segs -= bi.bi_idx;
1079 	i->count = bi.bi_size;
1080 	i->iov_offset = bi.bi_bvec_done;
1081 }
1082 
1083 static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
1084 {
1085 	const struct iovec *iov, *end;
1086 
1087 	if (!i->count)
1088 		return;
1089 	i->count -= size;
1090 
1091 	size += i->iov_offset; // from beginning of current segment
1092 	for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
1093 		if (likely(size < iov->iov_len))
1094 			break;
1095 		size -= iov->iov_len;
1096 	}
1097 	i->iov_offset = size;
1098 	i->nr_segs -= iov - i->iov;
1099 	i->iov = iov;
1100 }
1101 
1102 void iov_iter_advance(struct iov_iter *i, size_t size)
1103 {
1104 	if (unlikely(i->count < size))
1105 		size = i->count;
1106 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
1107 		/* iovec and kvec have identical layouts */
1108 		iov_iter_iovec_advance(i, size);
1109 	} else if (iov_iter_is_bvec(i)) {
1110 		iov_iter_bvec_advance(i, size);
1111 	} else if (iov_iter_is_pipe(i)) {
1112 		pipe_advance(i, size);
1113 	} else if (unlikely(iov_iter_is_xarray(i))) {
1114 		i->iov_offset += size;
1115 		i->count -= size;
1116 	} else if (iov_iter_is_discard(i)) {
1117 		i->count -= size;
1118 	}
1119 }
1120 EXPORT_SYMBOL(iov_iter_advance);
1121 
1122 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1123 {
1124 	if (!unroll)
1125 		return;
1126 	if (WARN_ON(unroll > MAX_RW_COUNT))
1127 		return;
1128 	i->count += unroll;
1129 	if (unlikely(iov_iter_is_pipe(i))) {
1130 		struct pipe_inode_info *pipe = i->pipe;
1131 		unsigned int p_mask = pipe->ring_size - 1;
1132 		unsigned int i_head = i->head;
1133 		size_t off = i->iov_offset;
1134 		while (1) {
1135 			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1136 			size_t n = off - b->offset;
1137 			if (unroll < n) {
1138 				off -= unroll;
1139 				break;
1140 			}
1141 			unroll -= n;
1142 			if (!unroll && i_head == i->start_head) {
1143 				off = 0;
1144 				break;
1145 			}
1146 			i_head--;
1147 			b = &pipe->bufs[i_head & p_mask];
1148 			off = b->offset + b->len;
1149 		}
1150 		i->iov_offset = off;
1151 		i->head = i_head;
1152 		pipe_truncate(i);
1153 		return;
1154 	}
1155 	if (unlikely(iov_iter_is_discard(i)))
1156 		return;
1157 	if (unroll <= i->iov_offset) {
1158 		i->iov_offset -= unroll;
1159 		return;
1160 	}
1161 	unroll -= i->iov_offset;
1162 	if (iov_iter_is_xarray(i)) {
1163 		BUG(); /* We should never go beyond the start of the specified
1164 			* range since we might then be straying into pages that
1165 			* aren't pinned.
1166 			*/
1167 	} else if (iov_iter_is_bvec(i)) {
1168 		const struct bio_vec *bvec = i->bvec;
1169 		while (1) {
1170 			size_t n = (--bvec)->bv_len;
1171 			i->nr_segs++;
1172 			if (unroll <= n) {
1173 				i->bvec = bvec;
1174 				i->iov_offset = n - unroll;
1175 				return;
1176 			}
1177 			unroll -= n;
1178 		}
1179 	} else { /* same logics for iovec and kvec */
1180 		const struct iovec *iov = i->iov;
1181 		while (1) {
1182 			size_t n = (--iov)->iov_len;
1183 			i->nr_segs++;
1184 			if (unroll <= n) {
1185 				i->iov = iov;
1186 				i->iov_offset = n - unroll;
1187 				return;
1188 			}
1189 			unroll -= n;
1190 		}
1191 	}
1192 }
1193 EXPORT_SYMBOL(iov_iter_revert);
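
/*
 * Illustrative sketch, not part of the original file: the usual
 * copy-then-revert dance around a backend that may accept fewer bytes
 * than were drained from the iterator.  "backend" is a caller-supplied
 * hypothetical worker (say, a queueing function); on a short or failed
 * write the iterator is wound back so it describes exactly the data the
 * backend has not consumed.
 */
static ssize_t __maybe_unused example_bounce_write(struct iov_iter *from,
		void *bounce, size_t size,
		ssize_t (*backend)(void *buf, size_t len))
{
	size_t copied;
	ssize_t accepted;

	if (!iov_iter_count(from))
		return 0;

	copied = copy_from_iter(bounce, min(size, iov_iter_count(from)), from);
	if (!copied)
		return -EFAULT;

	accepted = backend(bounce, copied);
	if (accepted < 0) {
		iov_iter_revert(from, copied);	/* nothing was consumed */
		return accepted;
	}
	if ((size_t)accepted < copied)		/* short write */
		iov_iter_revert(from, copied - accepted);
	return accepted;
}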
1194 
1195 /*
1196  * Return the count of just the current iov_iter segment.
1197  */
1198 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1199 {
1200 	if (i->nr_segs > 1) {
1201 		if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1202 			return min(i->count, i->iov->iov_len - i->iov_offset);
1203 		if (iov_iter_is_bvec(i))
1204 			return min(i->count, i->bvec->bv_len - i->iov_offset);
1205 	}
1206 	return i->count;
1207 }
1208 EXPORT_SYMBOL(iov_iter_single_seg_count);
1209 
1210 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1211 			const struct kvec *kvec, unsigned long nr_segs,
1212 			size_t count)
1213 {
1214 	WARN_ON(direction & ~(READ | WRITE));
1215 	*i = (struct iov_iter){
1216 		.iter_type = ITER_KVEC,
1217 		.data_source = direction,
1218 		.kvec = kvec,
1219 		.nr_segs = nr_segs,
1220 		.iov_offset = 0,
1221 		.count = count
1222 	};
1223 }
1224 EXPORT_SYMBOL(iov_iter_kvec);
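
/*
 * Illustrative sketch, not part of the original file: wrapping a plain
 * kernel buffer in a single-segment ITER_KVEC so it can be handed to code
 * that only speaks iov_iter.  Direction READ means the buffer will be
 * filled (copy_to_iter() and friends); WRITE means it is a data source.
 * The kvec is caller-provided because it must outlive the iterator.
 */
static void __maybe_unused example_kvec_iter(struct iov_iter *iter,
					     struct kvec *kv,
					     void *buf, size_t len)
{
	kv->iov_base = buf;
	kv->iov_len = len;
	iov_iter_kvec(iter, READ, kv, 1, len);
}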
1225 
1226 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1227 			const struct bio_vec *bvec, unsigned long nr_segs,
1228 			size_t count)
1229 {
1230 	WARN_ON(direction & ~(READ | WRITE));
1231 	*i = (struct iov_iter){
1232 		.iter_type = ITER_BVEC,
1233 		.data_source = direction,
1234 		.bvec = bvec,
1235 		.nr_segs = nr_segs,
1236 		.iov_offset = 0,
1237 		.count = count
1238 	};
1239 }
1240 EXPORT_SYMBOL(iov_iter_bvec);
1241 
1242 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1243 			struct pipe_inode_info *pipe,
1244 			size_t count)
1245 {
1246 	BUG_ON(direction != READ);
1247 	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1248 	*i = (struct iov_iter){
1249 		.iter_type = ITER_PIPE,
1250 		.data_source = false,
1251 		.pipe = pipe,
1252 		.head = pipe->head,
1253 		.start_head = pipe->head,
1254 		.iov_offset = 0,
1255 		.count = count
1256 	};
1257 }
1258 EXPORT_SYMBOL(iov_iter_pipe);
1259 
1260 /**
1261  * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
1262  * @i: The iterator to initialise.
1263  * @direction: The direction of the transfer.
1264  * @xarray: The xarray to access.
1265  * @start: The start file position.
1266  * @count: The size of the I/O buffer in bytes.
1267  *
1268  * Set up an I/O iterator to either draw data out of the pages attached to an
1269  * inode or to inject data into those pages.  The pages *must* be prevented
1270  * inode or to inject data into those pages.  The caller *must* prevent the
1271  * pages from evaporating, either by taking a ref on them or by locking
1272  * them.
1273 void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
1274 		     struct xarray *xarray, loff_t start, size_t count)
1275 {
1276 	BUG_ON(direction & ~1);
1277 	*i = (struct iov_iter) {
1278 		.iter_type = ITER_XARRAY,
1279 		.data_source = direction,
1280 		.xarray = xarray,
1281 		.xarray_start = start,
1282 		.count = count,
1283 		.iov_offset = 0
1284 	};
1285 }
1286 EXPORT_SYMBOL(iov_iter_xarray);
1287 
1288 /**
1289  * iov_iter_discard - Initialise an I/O iterator that discards data
1290  * @i: The iterator to initialise.
1291  * @direction: The direction of the transfer.
1292  * @count: The size of the I/O buffer in bytes.
1293  *
1294  * Set up an I/O iterator that just discards everything that's written to it.
1295  * It's only available as a READ iterator.
1296  */
1297 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1298 {
1299 	BUG_ON(direction != READ);
1300 	*i = (struct iov_iter){
1301 		.iter_type = ITER_DISCARD,
1302 		.data_source = false,
1303 		.count = count,
1304 		.iov_offset = 0
1305 	};
1306 }
1307 EXPORT_SYMBOL(iov_iter_discard);
1308 
1309 static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
1310 {
1311 	unsigned long res = 0;
1312 	size_t size = i->count;
1313 	size_t skip = i->iov_offset;
1314 	unsigned k;
1315 
1316 	for (k = 0; k < i->nr_segs; k++, skip = 0) {
1317 		size_t len = i->iov[k].iov_len - skip;
1318 		if (len) {
1319 			res |= (unsigned long)i->iov[k].iov_base + skip;
1320 			if (len > size)
1321 				len = size;
1322 			res |= len;
1323 			size -= len;
1324 			if (!size)
1325 				break;
1326 		}
1327 	}
1328 	return res;
1329 }
1330 
1331 static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
1332 {
1333 	unsigned res = 0;
1334 	size_t size = i->count;
1335 	unsigned skip = i->iov_offset;
1336 	unsigned k;
1337 
1338 	for (k = 0; k < i->nr_segs; k++, skip = 0) {
1339 		size_t len = i->bvec[k].bv_len - skip;
1340 		res |= (unsigned long)i->bvec[k].bv_offset + skip;
1341 		if (len > size)
1342 			len = size;
1343 		res |= len;
1344 		size -= len;
1345 		if (!size)
1346 			break;
1347 	}
1348 	return res;
1349 }
1350 
1351 unsigned long iov_iter_alignment(const struct iov_iter *i)
1352 {
1353 	/* iovec and kvec have identical layouts */
1354 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1355 		return iov_iter_alignment_iovec(i);
1356 
1357 	if (iov_iter_is_bvec(i))
1358 		return iov_iter_alignment_bvec(i);
1359 
1360 	if (iov_iter_is_pipe(i)) {
1361 		unsigned int p_mask = i->pipe->ring_size - 1;
1362 		size_t size = i->count;
1363 
1364 		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1365 			return size | i->iov_offset;
1366 		return size;
1367 	}
1368 
1369 	if (iov_iter_is_xarray(i))
1370 		return (i->xarray_start + i->iov_offset) | i->count;
1371 
1372 	return 0;
1373 }
1374 EXPORT_SYMBOL(iov_iter_alignment);
1375 
1376 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1377 {
1378 	unsigned long res = 0;
1379 	unsigned long v = 0;
1380 	size_t size = i->count;
1381 	unsigned k;
1382 
1383 	if (WARN_ON(!iter_is_iovec(i)))
1384 		return ~0U;
1385 
1386 	for (k = 0; k < i->nr_segs; k++) {
1387 		if (i->iov[k].iov_len) {
1388 			unsigned long base = (unsigned long)i->iov[k].iov_base;
1389 			if (v) // if not the first one
1390 				res |= base | v; // this start | previous end
1391 			v = base + i->iov[k].iov_len;
1392 			if (size <= i->iov[k].iov_len)
1393 				break;
1394 			size -= i->iov[k].iov_len;
1395 		}
1396 	}
1397 	return res;
1398 }
1399 EXPORT_SYMBOL(iov_iter_gap_alignment);
1400 
1401 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1402 				size_t maxsize,
1403 				struct page **pages,
1404 				int iter_head,
1405 				size_t *start)
1406 {
1407 	struct pipe_inode_info *pipe = i->pipe;
1408 	unsigned int p_mask = pipe->ring_size - 1;
1409 	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1410 	if (!n)
1411 		return -EFAULT;
1412 
1413 	maxsize = n;
1414 	n += *start;
1415 	while (n > 0) {
1416 		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1417 		iter_head++;
1418 		n -= PAGE_SIZE;
1419 	}
1420 
1421 	return maxsize;
1422 }
1423 
1424 static ssize_t pipe_get_pages(struct iov_iter *i,
1425 		   struct page **pages, size_t maxsize, unsigned maxpages,
1426 		   size_t *start)
1427 {
1428 	unsigned int iter_head, npages;
1429 	size_t capacity;
1430 
1431 	if (!sanity(i))
1432 		return -EFAULT;
1433 
1434 	data_start(i, &iter_head, start);
1435 	/* Amount of free space: some of this one + all after this one */
1436 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1437 	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1438 
1439 	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1440 }
1441 
1442 static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
1443 					  pgoff_t index, unsigned int nr_pages)
1444 {
1445 	XA_STATE(xas, xa, index);
1446 	struct page *page;
1447 	unsigned int ret = 0;
1448 
1449 	rcu_read_lock();
1450 	for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1451 		if (xas_retry(&xas, page))
1452 			continue;
1453 
1454 		/* Has the page moved or been split? */
1455 		if (unlikely(page != xas_reload(&xas))) {
1456 			xas_reset(&xas);
1457 			continue;
1458 		}
1459 
1460 		pages[ret] = find_subpage(page, xas.xa_index);
1461 		get_page(pages[ret]);
1462 		if (++ret == nr_pages)
1463 			break;
1464 	}
1465 	rcu_read_unlock();
1466 	return ret;
1467 }
1468 
1469 static ssize_t iter_xarray_get_pages(struct iov_iter *i,
1470 				     struct page **pages, size_t maxsize,
1471 				     unsigned maxpages, size_t *_start_offset)
1472 {
1473 	unsigned nr, offset;
1474 	pgoff_t index, count;
1475 	size_t size = maxsize, actual;
1476 	loff_t pos;
1477 
1478 	if (!size || !maxpages)
1479 		return 0;
1480 
1481 	pos = i->xarray_start + i->iov_offset;
1482 	index = pos >> PAGE_SHIFT;
1483 	offset = pos & ~PAGE_MASK;
1484 	*_start_offset = offset;
1485 
1486 	count = 1;
1487 	if (size > PAGE_SIZE - offset) {
1488 		size -= PAGE_SIZE - offset;
1489 		count += size >> PAGE_SHIFT;
1490 		size &= ~PAGE_MASK;
1491 		if (size)
1492 			count++;
1493 	}
1494 
1495 	if (count > maxpages)
1496 		count = maxpages;
1497 
1498 	nr = iter_xarray_populate_pages(pages, i->xarray, index, count);
1499 	if (nr == 0)
1500 		return 0;
1501 
1502 	actual = PAGE_SIZE * nr;
1503 	actual -= offset;
1504 	if (nr == count && size > 0) {
1505 		unsigned last_offset = (nr > 1) ? 0 : offset;
1506 		actual -= PAGE_SIZE - (last_offset + size);
1507 	}
1508 	return actual;
1509 }
1510 
1511 /* must be done on a non-empty ITER_IOVEC */
1512 static unsigned long first_iovec_segment(const struct iov_iter *i,
1513 					 size_t *size, size_t *start,
1514 					 size_t maxsize, unsigned maxpages)
1515 {
1516 	size_t skip;
1517 	long k;
1518 
1519 	for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
1520 		unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
1521 		size_t len = i->iov[k].iov_len - skip;
1522 
1523 		if (unlikely(!len))
1524 			continue;
1525 		if (len > maxsize)
1526 			len = maxsize;
1527 		len += (*start = addr % PAGE_SIZE);
1528 		if (len > maxpages * PAGE_SIZE)
1529 			len = maxpages * PAGE_SIZE;
1530 		*size = len;
1531 		return addr & PAGE_MASK;
1532 	}
1533 	BUG(); // if it had been empty, we wouldn't get called
1534 }
1535 
1536 /* must be done on a non-empty ITER_BVEC */
1537 static struct page *first_bvec_segment(const struct iov_iter *i,
1538 				       size_t *size, size_t *start,
1539 				       size_t maxsize, unsigned maxpages)
1540 {
1541 	struct page *page;
1542 	size_t skip = i->iov_offset, len;
1543 
1544 	len = i->bvec->bv_len - skip;
1545 	if (len > maxsize)
1546 		len = maxsize;
1547 	skip += i->bvec->bv_offset;
1548 	page = i->bvec->bv_page + skip / PAGE_SIZE;
1549 	len += (*start = skip % PAGE_SIZE);
1550 	if (len > maxpages * PAGE_SIZE)
1551 		len = maxpages * PAGE_SIZE;
1552 	*size = len;
1553 	return page;
1554 }
1555 
1556 ssize_t iov_iter_get_pages(struct iov_iter *i,
1557 		   struct page **pages, size_t maxsize, unsigned maxpages,
1558 		   size_t *start)
1559 {
1560 	size_t len;
1561 	int n, res;
1562 
1563 	if (maxsize > i->count)
1564 		maxsize = i->count;
1565 	if (!maxsize)
1566 		return 0;
1567 
1568 	if (likely(iter_is_iovec(i))) {
1569 		unsigned long addr;
1570 
1571 		addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
1572 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1573 		res = get_user_pages_fast(addr, n,
1574 				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1575 				pages);
1576 		if (unlikely(res < 0))
1577 			return res;
1578 		return (res == n ? len : res * PAGE_SIZE) - *start;
1579 	}
1580 	if (iov_iter_is_bvec(i)) {
1581 		struct page *page;
1582 
1583 		page = first_bvec_segment(i, &len, start, maxsize, maxpages);
1584 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1585 		while (n--)
1586 			get_page(*pages++ = page++);
1587 		return len - *start;
1588 	}
1589 	if (iov_iter_is_pipe(i))
1590 		return pipe_get_pages(i, pages, maxsize, maxpages, start);
1591 	if (iov_iter_is_xarray(i))
1592 		return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
1593 	return -EFAULT;
1594 }
1595 EXPORT_SYMBOL(iov_iter_get_pages);
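
/*
 * Illustrative sketch, not part of the original file: pinning the front of
 * an iterator for direct access (e.g. to build a bio or an sg list).  On
 * success the caller holds one reference per returned page and must drop
 * each with put_page() once the I/O completes.  iov_iter_get_pages() does
 * not advance the iterator, so that is done here explicitly with the byte
 * count actually mapped; the data starts at "*offset" within the first page.
 */
static ssize_t __maybe_unused example_pin_pages(struct iov_iter *i,
						struct page **pages,
						unsigned int maxpages,
						size_t *offset)
{
	ssize_t bytes;

	bytes = iov_iter_get_pages(i, pages, iov_iter_count(i), maxpages,
				   offset);
	if (bytes > 0)
		iov_iter_advance(i, bytes);
	return bytes;
}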
1596 
1597 static struct page **get_pages_array(size_t n)
1598 {
1599 	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1600 }
1601 
1602 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1603 		   struct page ***pages, size_t maxsize,
1604 		   size_t *start)
1605 {
1606 	struct page **p;
1607 	unsigned int iter_head, npages;
1608 	ssize_t n;
1609 
1610 	if (!sanity(i))
1611 		return -EFAULT;
1612 
1613 	data_start(i, &iter_head, start);
1614 	/* Amount of free space: some of this one + all after this one */
1615 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1616 	n = npages * PAGE_SIZE - *start;
1617 	if (maxsize > n)
1618 		maxsize = n;
1619 	else
1620 		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1621 	p = get_pages_array(npages);
1622 	if (!p)
1623 		return -ENOMEM;
1624 	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1625 	if (n > 0)
1626 		*pages = p;
1627 	else
1628 		kvfree(p);
1629 	return n;
1630 }
1631 
1632 static ssize_t iter_xarray_get_pages_alloc(struct iov_iter *i,
1633 					   struct page ***pages, size_t maxsize,
1634 					   size_t *_start_offset)
1635 {
1636 	struct page **p;
1637 	unsigned nr, offset;
1638 	pgoff_t index, count;
1639 	size_t size = maxsize, actual;
1640 	loff_t pos;
1641 
1642 	if (!size)
1643 		return 0;
1644 
1645 	pos = i->xarray_start + i->iov_offset;
1646 	index = pos >> PAGE_SHIFT;
1647 	offset = pos & ~PAGE_MASK;
1648 	*_start_offset = offset;
1649 
1650 	count = 1;
1651 	if (size > PAGE_SIZE - offset) {
1652 		size -= PAGE_SIZE - offset;
1653 		count += size >> PAGE_SHIFT;
1654 		size &= ~PAGE_MASK;
1655 		if (size)
1656 			count++;
1657 	}
1658 
1659 	p = get_pages_array(count);
1660 	if (!p)
1661 		return -ENOMEM;
1662 	*pages = p;
1663 
1664 	nr = iter_xarray_populate_pages(p, i->xarray, index, count);
1665 	if (nr == 0)
1666 		return 0;
1667 
1668 	actual = PAGE_SIZE * nr;
1669 	actual -= offset;
1670 	if (nr == count && size > 0) {
1671 		unsigned last_offset = (nr > 1) ? 0 : offset;
1672 		actual -= PAGE_SIZE - (last_offset + size);
1673 	}
1674 	return actual;
1675 }
1676 
1677 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1678 		   struct page ***pages, size_t maxsize,
1679 		   size_t *start)
1680 {
1681 	struct page **p;
1682 	size_t len;
1683 	int n, res;
1684 
1685 	if (maxsize > i->count)
1686 		maxsize = i->count;
1687 	if (!maxsize)
1688 		return 0;
1689 
1690 	if (likely(iter_is_iovec(i))) {
1691 		unsigned long addr;
1692 
1693 		addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
1694 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1695 		p = get_pages_array(n);
1696 		if (!p)
1697 			return -ENOMEM;
1698 		res = get_user_pages_fast(addr, n,
1699 				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1700 		if (unlikely(res < 0)) {
1701 			kvfree(p);
1702 			return res;
1703 		}
1704 		*pages = p;
1705 		return (res == n ? len : res * PAGE_SIZE) - *start;
1706 	}
1707 	if (iov_iter_is_bvec(i)) {
1708 		struct page *page;
1709 
1710 		page = first_bvec_segment(i, &len, start, maxsize, ~0U);
1711 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1712 		*pages = p = get_pages_array(n);
1713 		if (!p)
1714 			return -ENOMEM;
1715 		while (n--)
1716 			get_page(*p++ = page++);
1717 		return len - *start;
1718 	}
1719 	if (iov_iter_is_pipe(i))
1720 		return pipe_get_pages_alloc(i, pages, maxsize, start);
1721 	if (iov_iter_is_xarray(i))
1722 		return iter_xarray_get_pages_alloc(i, pages, maxsize, start);
1723 	return -EFAULT;
1724 }
1725 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1726 
1727 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1728 			       struct iov_iter *i)
1729 {
1730 	char *to = addr;
1731 	__wsum sum, next;
1732 	size_t off = 0;
1733 	sum = *csum;
1734 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1735 		WARN_ON(1);
1736 		return 0;
1737 	}
1738 	iterate_and_advance(i, bytes, v, ({
1739 		next = csum_and_copy_from_user(v.iov_base,
1740 					       (to += v.iov_len) - v.iov_len,
1741 					       v.iov_len);
1742 		if (next) {
1743 			sum = csum_block_add(sum, next, off);
1744 			off += v.iov_len;
1745 		}
1746 		next ? 0 : v.iov_len;
1747 	}), ({
1748 		char *p = kmap_atomic(v.bv_page);
1749 		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1750 				      p + v.bv_offset, v.bv_len,
1751 				      sum, off);
1752 		kunmap_atomic(p);
1753 		off += v.bv_len;
1754 	}),({
1755 		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1756 				      v.iov_base, v.iov_len,
1757 				      sum, off);
1758 		off += v.iov_len;
1759 	}), ({
1760 		char *p = kmap_atomic(v.bv_page);
1761 		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1762 				      p + v.bv_offset, v.bv_len,
1763 				      sum, off);
1764 		kunmap_atomic(p);
1765 		off += v.bv_len;
1766 	})
1767 	)
1768 	*csum = sum;
1769 	return bytes;
1770 }
1771 EXPORT_SYMBOL(csum_and_copy_from_iter);
1772 
1773 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1774 			     struct iov_iter *i)
1775 {
1776 	struct csum_state *csstate = _csstate;
1777 	const char *from = addr;
1778 	__wsum sum, next;
1779 	size_t off;
1780 
1781 	if (unlikely(iov_iter_is_pipe(i)))
1782 		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1783 
1784 	sum = csum_shift(csstate->csum, csstate->off);
1785 	off = 0;
1786 	if (unlikely(iov_iter_is_discard(i))) {
1787 		WARN_ON(1);	/* for now */
1788 		return 0;
1789 	}
1790 	iterate_and_advance(i, bytes, v, ({
1791 		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
1792 					     v.iov_base,
1793 					     v.iov_len);
1794 		if (next) {
1795 			sum = csum_block_add(sum, next, off);
1796 			off += v.iov_len;
1797 		}
1798 		next ? 0 : v.iov_len;
1799 	}), ({
1800 		char *p = kmap_atomic(v.bv_page);
1801 		sum = csum_and_memcpy(p + v.bv_offset,
1802 				      (from += v.bv_len) - v.bv_len,
1803 				      v.bv_len, sum, off);
1804 		kunmap_atomic(p);
1805 		off += v.bv_len;
1806 	}),({
1807 		sum = csum_and_memcpy(v.iov_base,
1808 				     (from += v.iov_len) - v.iov_len,
1809 				     v.iov_len, sum, off);
1810 		off += v.iov_len;
1811 	}), ({
1812 		char *p = kmap_atomic(v.bv_page);
1813 		sum = csum_and_memcpy(p + v.bv_offset,
1814 				      (from += v.bv_len) - v.bv_len,
1815 				      v.bv_len, sum, off);
1816 		kunmap_atomic(p);
1817 		off += v.bv_len;
1818 	})
1819 	)
1820 	csstate->csum = csum_shift(sum, csstate->off);
1821 	csstate->off += bytes;
1822 	return bytes;
1823 }
1824 EXPORT_SYMBOL(csum_and_copy_to_iter);
1825 
1826 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1827 		struct iov_iter *i)
1828 {
1829 #ifdef CONFIG_CRYPTO_HASH
1830 	struct ahash_request *hash = hashp;
1831 	struct scatterlist sg;
1832 	size_t copied;
1833 
1834 	copied = copy_to_iter(addr, bytes, i);
1835 	sg_init_one(&sg, addr, copied);
1836 	ahash_request_set_crypt(hash, &sg, NULL, copied);
1837 	crypto_ahash_update(hash);
1838 	return copied;
1839 #else
1840 	return 0;
1841 #endif
1842 }
1843 EXPORT_SYMBOL(hash_and_copy_to_iter);
1844 
1845 static int iov_npages(const struct iov_iter *i, int maxpages)
1846 {
1847 	size_t skip = i->iov_offset, size = i->count;
1848 	const struct iovec *p;
1849 	int npages = 0;
1850 
1851 	for (p = i->iov; size; skip = 0, p++) {
1852 		unsigned offs = offset_in_page(p->iov_base + skip);
1853 		size_t len = min(p->iov_len - skip, size);
1854 
1855 		if (len) {
1856 			size -= len;
1857 			npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1858 			if (unlikely(npages > maxpages))
1859 				return maxpages;
1860 		}
1861 	}
1862 	return npages;
1863 }
1864 
1865 static int bvec_npages(const struct iov_iter *i, int maxpages)
1866 {
1867 	size_t skip = i->iov_offset, size = i->count;
1868 	const struct bio_vec *p;
1869 	int npages = 0;
1870 
1871 	for (p = i->bvec; size; skip = 0, p++) {
1872 		unsigned offs = (p->bv_offset + skip) % PAGE_SIZE;
1873 		size_t len = min(p->bv_len - skip, size);
1874 
1875 		size -= len;
1876 		npages += DIV_ROUND_UP(offs + len, PAGE_SIZE);
1877 		if (unlikely(npages > maxpages))
1878 			return maxpages;
1879 	}
1880 	return npages;
1881 }
1882 
1883 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1884 {
1885 	if (unlikely(!i->count))
1886 		return 0;
1887 	/* iovec and kvec have identical layouts */
1888 	if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
1889 		return iov_npages(i, maxpages);
1890 	if (iov_iter_is_bvec(i))
1891 		return bvec_npages(i, maxpages);
1892 	if (iov_iter_is_pipe(i)) {
1893 		unsigned int iter_head;
1894 		int npages;
1895 		size_t off;
1896 
1897 		if (!sanity(i))
1898 			return 0;
1899 
1900 		data_start(i, &iter_head, &off);
1901 		/* some of this one + all after this one */
1902 		npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1903 		return min(npages, maxpages);
1904 	}
1905 	if (iov_iter_is_xarray(i)) {
1906 		unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
1907 		int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE);
1908 		return min(npages, maxpages);
1909 	}
1910 	return 0;
1911 }
1912 EXPORT_SYMBOL(iov_iter_npages);
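
/*
 * Illustrative sketch, not part of the original file: using
 * iov_iter_npages() to size a page-pointer array before pinning with
 * iov_iter_get_pages().  The cap of 256 pages is an arbitrary value for
 * the example, not a kernel constant.
 */
static __maybe_unused struct page **example_alloc_page_array(
		const struct iov_iter *i, int *npages)
{
	*npages = iov_iter_npages(i, 256);
	if (!*npages)
		return NULL;
	return kmalloc_array(*npages, sizeof(struct page *), GFP_KERNEL);
}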
1913 
1914 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1915 {
1916 	*new = *old;
1917 	if (unlikely(iov_iter_is_pipe(new))) {
1918 		WARN_ON(1);
1919 		return NULL;
1920 	}
1921 	if (unlikely(iov_iter_is_discard(new) || iov_iter_is_xarray(new)))
1922 		return NULL;
1923 	if (iov_iter_is_bvec(new))
1924 		return new->bvec = kmemdup(new->bvec,
1925 				    new->nr_segs * sizeof(struct bio_vec),
1926 				    flags);
1927 	else
1928 		/* iovec and kvec have identical layout */
1929 		return new->iov = kmemdup(new->iov,
1930 				   new->nr_segs * sizeof(struct iovec),
1931 				   flags);
1932 }
1933 EXPORT_SYMBOL(dup_iter);
1934 
1935 static int copy_compat_iovec_from_user(struct iovec *iov,
1936 		const struct iovec __user *uvec, unsigned long nr_segs)
1937 {
1938 	const struct compat_iovec __user *uiov =
1939 		(const struct compat_iovec __user *)uvec;
1940 	int ret = -EFAULT, i;
1941 
1942 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1943 		return -EFAULT;
1944 
1945 	for (i = 0; i < nr_segs; i++) {
1946 		compat_uptr_t buf;
1947 		compat_ssize_t len;
1948 
1949 		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1950 		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1951 
1952 		/* check for compat_size_t not fitting in compat_ssize_t .. */
1953 		if (len < 0) {
1954 			ret = -EINVAL;
1955 			goto uaccess_end;
1956 		}
1957 		iov[i].iov_base = compat_ptr(buf);
1958 		iov[i].iov_len = len;
1959 	}
1960 
1961 	ret = 0;
1962 uaccess_end:
1963 	user_access_end();
1964 	return ret;
1965 }
1966 
1967 static int copy_iovec_from_user(struct iovec *iov,
1968 		const struct iovec __user *uvec, unsigned long nr_segs)
1969 {
1970 	unsigned long seg;
1971 
1972 	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1973 		return -EFAULT;
1974 	for (seg = 0; seg < nr_segs; seg++) {
1975 		if ((ssize_t)iov[seg].iov_len < 0)
1976 			return -EINVAL;
1977 	}
1978 
1979 	return 0;
1980 }
1981 
1982 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1983 		unsigned long nr_segs, unsigned long fast_segs,
1984 		struct iovec *fast_iov, bool compat)
1985 {
1986 	struct iovec *iov = fast_iov;
1987 	int ret;
1988 
1989 	/*
1990 	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1991 	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1992 	 * traditionally returned zero for zero segments, so...
1993 	 */
1994 	if (nr_segs == 0)
1995 		return iov;
1996 	if (nr_segs > UIO_MAXIOV)
1997 		return ERR_PTR(-EINVAL);
1998 	if (nr_segs > fast_segs) {
1999 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
2000 		if (!iov)
2001 			return ERR_PTR(-ENOMEM);
2002 	}
2003 
2004 	if (compat)
2005 		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
2006 	else
2007 		ret = copy_iovec_from_user(iov, uvec, nr_segs);
2008 	if (ret) {
2009 		if (iov != fast_iov)
2010 			kfree(iov);
2011 		return ERR_PTR(ret);
2012 	}
2013 
2014 	return iov;
2015 }
2016 
2017 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
2018 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
2019 		 struct iov_iter *i, bool compat)
2020 {
2021 	ssize_t total_len = 0;
2022 	unsigned long seg;
2023 	struct iovec *iov;
2024 
2025 	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
2026 	if (IS_ERR(iov)) {
2027 		*iovp = NULL;
2028 		return PTR_ERR(iov);
2029 	}
2030 
2031 	/*
2032 	 * According to the Single Unix Specification we should return EINVAL if
2033 	 * an element length is < 0 when cast to ssize_t or if the total length
2034 	 * would overflow the ssize_t return value of the system call.
2035 	 *
2036 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
2037 	 * overflow case.
2038 	 */
2039 	for (seg = 0; seg < nr_segs; seg++) {
2040 		ssize_t len = (ssize_t)iov[seg].iov_len;
2041 
2042 		if (!access_ok(iov[seg].iov_base, len)) {
2043 			if (iov != *iovp)
2044 				kfree(iov);
2045 			*iovp = NULL;
2046 			return -EFAULT;
2047 		}
2048 
2049 		if (len > MAX_RW_COUNT - total_len) {
2050 			len = MAX_RW_COUNT - total_len;
2051 			iov[seg].iov_len = len;
2052 		}
2053 		total_len += len;
2054 	}
2055 
2056 	iov_iter_init(i, type, iov, nr_segs, total_len);
2057 	if (iov == *iovp)
2058 		*iovp = NULL;
2059 	else
2060 		*iovp = iov;
2061 	return total_len;
2062 }
2063 
2064 /**
2065  * import_iovec() - Copy an array of &struct iovec from userspace
2066  *     into the kernel, check that it is valid, and initialize a new
2067  *     &struct iov_iter iterator to access it.
2068  *
2069  * @type: One of %READ or %WRITE.
2070  * @uvec: Pointer to the userspace array.
2071  * @nr_segs: Number of elements in userspace array.
2072  * @fast_segs: Number of elements in *@iovp.
2073  * @iovp: (input and output parameter) Pointer to pointer to (usually small
2074  *     on-stack) kernel array.
2075  * @i: Pointer to iterator that will be initialized on success.
2076  *
2077  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
2078  * then this function places %NULL in *@iovp on return. Otherwise, a new
2079  * array will be allocated and the result placed in *@iovp. This means that
2080  * the caller may call kfree() on *@iovp regardless of whether the small
2081  * on-stack array was used or not (and regardless of whether this function
2082  * returns an error or not).
2083  *
2084  * Return: Negative error code on error, bytes imported on success
2085  */
2086 ssize_t import_iovec(int type, const struct iovec __user *uvec,
2087 		 unsigned nr_segs, unsigned fast_segs,
2088 		 struct iovec **iovp, struct iov_iter *i)
2089 {
2090 	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
2091 			      in_compat_syscall());
2092 }
2093 EXPORT_SYMBOL(import_iovec);
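
/*
 * Illustrative sketch, not part of the original file: the standard
 * syscall-side pattern around import_iovec(), as used by readv()/writev()
 * style entry points.  "consume" stands in for the real worker (e.g. a
 * ->read_iter call).  The unconditional kfree(iov) is safe: *iovp is set
 * to NULL when the on-stack fast array was sufficient, and to the heap
 * allocation otherwise.
 */
static ssize_t __maybe_unused example_import_and_read(
		const struct iovec __user *uvec, unsigned long nr_segs,
		ssize_t (*consume)(struct iov_iter *i))
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(READ, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	ret = consume(&iter);
	kfree(iov);
	return ret;
}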
2094 
2095 int import_single_range(int rw, void __user *buf, size_t len,
2096 		 struct iovec *iov, struct iov_iter *i)
2097 {
2098 	if (len > MAX_RW_COUNT)
2099 		len = MAX_RW_COUNT;
2100 	if (unlikely(!access_ok(buf, len)))
2101 		return -EFAULT;
2102 
2103 	iov->iov_base = buf;
2104 	iov->iov_len = len;
2105 	iov_iter_init(i, rw, iov, 1, len);
2106 	return 0;
2107 }
2108 EXPORT_SYMBOL(import_single_range);
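
/*
 * Illustrative sketch, not part of the original file: the single-buffer
 * analogue used by plain read(2)/write(2) style paths.  Unlike
 * import_iovec() there is nothing to free afterwards, but the struct iovec
 * must live at least as long as the iterator built on top of it.
 * "consume" again stands in for the real worker (e.g. ->write_iter).
 */
static ssize_t __maybe_unused example_import_single(void __user *buf,
		size_t len, ssize_t (*consume)(struct iov_iter *i))
{
	struct iovec iov;
	struct iov_iter iter;
	int ret;

	ret = import_single_range(WRITE, buf, len, &iov, &iter);
	if (ret)
		return ret;
	return consume(&iter);
}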
2109