/* xref: /openbmc/linux/lib/iov_iter.c (revision 84b102f5) */
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <crypto/hash.h>
3 #include <linux/export.h>
4 #include <linux/bvec.h>
5 #include <linux/fault-inject-usercopy.h>
6 #include <linux/uio.h>
7 #include <linux/pagemap.h>
8 #include <linux/slab.h>
9 #include <linux/vmalloc.h>
10 #include <linux/splice.h>
11 #include <linux/compat.h>
12 #include <net/checksum.h>
13 #include <linux/scatterlist.h>
14 #include <linux/instrumented.h>
15 
16 #define PIPE_PARANOIA /* for now */
17 
18 #define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
19 	size_t left;					\
20 	size_t wanted = n;				\
21 	__p = i->iov;					\
22 	__v.iov_len = min(n, __p->iov_len - skip);	\
23 	if (likely(__v.iov_len)) {			\
24 		__v.iov_base = __p->iov_base + skip;	\
25 		left = (STEP);				\
26 		__v.iov_len -= left;			\
27 		skip += __v.iov_len;			\
28 		n -= __v.iov_len;			\
29 	} else {					\
30 		left = 0;				\
31 	}						\
32 	while (unlikely(!left && n)) {			\
33 		__p++;					\
34 		__v.iov_len = min(n, __p->iov_len);	\
35 		if (unlikely(!__v.iov_len))		\
36 			continue;			\
37 		__v.iov_base = __p->iov_base;		\
38 		left = (STEP);				\
39 		__v.iov_len -= left;			\
40 		skip = __v.iov_len;			\
41 		n -= __v.iov_len;			\
42 	}						\
43 	n = wanted - n;					\
44 }
45 
46 #define iterate_kvec(i, n, __v, __p, skip, STEP) {	\
47 	size_t wanted = n;				\
48 	__p = i->kvec;					\
49 	__v.iov_len = min(n, __p->iov_len - skip);	\
50 	if (likely(__v.iov_len)) {			\
51 		__v.iov_base = __p->iov_base + skip;	\
52 		(void)(STEP);				\
53 		skip += __v.iov_len;			\
54 		n -= __v.iov_len;			\
55 	}						\
56 	while (unlikely(n)) {				\
57 		__p++;					\
58 		__v.iov_len = min(n, __p->iov_len);	\
59 		if (unlikely(!__v.iov_len))		\
60 			continue;			\
61 		__v.iov_base = __p->iov_base;		\
62 		(void)(STEP);				\
63 		skip = __v.iov_len;			\
64 		n -= __v.iov_len;			\
65 	}						\
66 	n = wanted;					\
67 }
68 
69 #define iterate_bvec(i, n, __v, __bi, skip, STEP) {	\
70 	struct bvec_iter __start;			\
71 	__start.bi_size = n;				\
72 	__start.bi_bvec_done = skip;			\
73 	__start.bi_idx = 0;				\
74 	for_each_bvec(__v, i->bvec, __bi, __start) {	\
75 		if (!__v.bv_len)			\
76 			continue;			\
77 		(void)(STEP);				\
78 	}						\
79 }
80 
81 #define iterate_all_kinds(i, n, v, I, B, K) {			\
82 	if (likely(n)) {					\
83 		size_t skip = i->iov_offset;			\
84 		if (unlikely(i->type & ITER_BVEC)) {		\
85 			struct bio_vec v;			\
86 			struct bvec_iter __bi;			\
87 			iterate_bvec(i, n, v, __bi, skip, (B))	\
88 		} else if (unlikely(i->type & ITER_KVEC)) {	\
89 			const struct kvec *kvec;		\
90 			struct kvec v;				\
91 			iterate_kvec(i, n, v, kvec, skip, (K))	\
92 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
93 		} else {					\
94 			const struct iovec *iov;		\
95 			struct iovec v;				\
96 			iterate_iovec(i, n, v, iov, skip, (I))	\
97 		}						\
98 	}							\
99 }
100 
101 #define iterate_and_advance(i, n, v, I, B, K) {			\
102 	if (unlikely(i->count < n))				\
103 		n = i->count;					\
104 	if (i->count) {						\
105 		size_t skip = i->iov_offset;			\
106 		if (unlikely(i->type & ITER_BVEC)) {		\
107 			const struct bio_vec *bvec = i->bvec;	\
108 			struct bio_vec v;			\
109 			struct bvec_iter __bi;			\
110 			iterate_bvec(i, n, v, __bi, skip, (B))	\
111 			i->bvec = __bvec_iter_bvec(i->bvec, __bi);	\
112 			i->nr_segs -= i->bvec - bvec;		\
113 			skip = __bi.bi_bvec_done;		\
114 		} else if (unlikely(i->type & ITER_KVEC)) {	\
115 			const struct kvec *kvec;		\
116 			struct kvec v;				\
117 			iterate_kvec(i, n, v, kvec, skip, (K))	\
118 			if (skip == kvec->iov_len) {		\
119 				kvec++;				\
120 				skip = 0;			\
121 			}					\
122 			i->nr_segs -= kvec - i->kvec;		\
123 			i->kvec = kvec;				\
124 		} else if (unlikely(i->type & ITER_DISCARD)) {	\
125 			skip += n;				\
126 		} else {					\
127 			const struct iovec *iov;		\
128 			struct iovec v;				\
129 			iterate_iovec(i, n, v, iov, skip, (I))	\
130 			if (skip == iov->iov_len) {		\
131 				iov++;				\
132 				skip = 0;			\
133 			}					\
134 			i->nr_segs -= iov - i->iov;		\
135 			i->iov = iov;				\
136 		}						\
137 		i->count -= n;					\
138 		i->iov_offset = skip;				\
139 	}							\
140 }
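
/*
 * Editorial sketch (not part of the original file): the I/B/K arguments to
 * iterate_all_kinds()/iterate_and_advance() are step expressions evaluated
 * once per iovec, bio_vec and kvec segment respectively, with the current
 * segment visible as @v.  A hypothetical helper that merely totals the
 * segment lengths it walks (without copying anything) could look like this;
 * the helper name is made up for illustration.
 */
static __maybe_unused size_t iov_iter_example_total_len(const struct iov_iter *i,
							size_t bytes)
{
	size_t seen = 0;

	if (bytes > i->count)
		bytes = i->count;
	iterate_all_kinds(i, bytes, v,
		(seen += v.iov_len, 0),		/* ITER_IOVEC: must yield "left" */
		seen += v.bv_len,		/* ITER_BVEC */
		seen += v.iov_len		/* ITER_KVEC */
	)
	return seen;
}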
141 
142 static int copyout(void __user *to, const void *from, size_t n)
143 {
144 	if (should_fail_usercopy())
145 		return n;
146 	if (access_ok(to, n)) {
147 		instrument_copy_to_user(to, from, n);
148 		n = raw_copy_to_user(to, from, n);
149 	}
150 	return n;
151 }
152 
153 static int copyin(void *to, const void __user *from, size_t n)
154 {
155 	if (should_fail_usercopy())
156 		return n;
157 	if (access_ok(from, n)) {
158 		instrument_copy_from_user(to, from, n);
159 		n = raw_copy_from_user(to, from, n);
160 	}
161 	return n;
162 }
163 
164 static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
165 			 struct iov_iter *i)
166 {
167 	size_t skip, copy, left, wanted;
168 	const struct iovec *iov;
169 	char __user *buf;
170 	void *kaddr, *from;
171 
172 	if (unlikely(bytes > i->count))
173 		bytes = i->count;
174 
175 	if (unlikely(!bytes))
176 		return 0;
177 
178 	might_fault();
179 	wanted = bytes;
180 	iov = i->iov;
181 	skip = i->iov_offset;
182 	buf = iov->iov_base + skip;
183 	copy = min(bytes, iov->iov_len - skip);
184 
185 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
186 		kaddr = kmap_atomic(page);
187 		from = kaddr + offset;
188 
189 		/* first chunk, usually the only one */
190 		left = copyout(buf, from, copy);
191 		copy -= left;
192 		skip += copy;
193 		from += copy;
194 		bytes -= copy;
195 
196 		while (unlikely(!left && bytes)) {
197 			iov++;
198 			buf = iov->iov_base;
199 			copy = min(bytes, iov->iov_len);
200 			left = copyout(buf, from, copy);
201 			copy -= left;
202 			skip = copy;
203 			from += copy;
204 			bytes -= copy;
205 		}
206 		if (likely(!bytes)) {
207 			kunmap_atomic(kaddr);
208 			goto done;
209 		}
210 		offset = from - kaddr;
211 		buf += copy;
212 		kunmap_atomic(kaddr);
213 		copy = min(bytes, iov->iov_len - skip);
214 	}
215 	/* Too bad - revert to non-atomic kmap */
216 
217 	kaddr = kmap(page);
218 	from = kaddr + offset;
219 	left = copyout(buf, from, copy);
220 	copy -= left;
221 	skip += copy;
222 	from += copy;
223 	bytes -= copy;
224 	while (unlikely(!left && bytes)) {
225 		iov++;
226 		buf = iov->iov_base;
227 		copy = min(bytes, iov->iov_len);
228 		left = copyout(buf, from, copy);
229 		copy -= left;
230 		skip = copy;
231 		from += copy;
232 		bytes -= copy;
233 	}
234 	kunmap(page);
235 
236 done:
237 	if (skip == iov->iov_len) {
238 		iov++;
239 		skip = 0;
240 	}
241 	i->count -= wanted - bytes;
242 	i->nr_segs -= iov - i->iov;
243 	i->iov = iov;
244 	i->iov_offset = skip;
245 	return wanted - bytes;
246 }
247 
248 static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
249 			 struct iov_iter *i)
250 {
251 	size_t skip, copy, left, wanted;
252 	const struct iovec *iov;
253 	char __user *buf;
254 	void *kaddr, *to;
255 
256 	if (unlikely(bytes > i->count))
257 		bytes = i->count;
258 
259 	if (unlikely(!bytes))
260 		return 0;
261 
262 	might_fault();
263 	wanted = bytes;
264 	iov = i->iov;
265 	skip = i->iov_offset;
266 	buf = iov->iov_base + skip;
267 	copy = min(bytes, iov->iov_len - skip);
268 
269 	if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
270 		kaddr = kmap_atomic(page);
271 		to = kaddr + offset;
272 
273 		/* first chunk, usually the only one */
274 		left = copyin(to, buf, copy);
275 		copy -= left;
276 		skip += copy;
277 		to += copy;
278 		bytes -= copy;
279 
280 		while (unlikely(!left && bytes)) {
281 			iov++;
282 			buf = iov->iov_base;
283 			copy = min(bytes, iov->iov_len);
284 			left = copyin(to, buf, copy);
285 			copy -= left;
286 			skip = copy;
287 			to += copy;
288 			bytes -= copy;
289 		}
290 		if (likely(!bytes)) {
291 			kunmap_atomic(kaddr);
292 			goto done;
293 		}
294 		offset = to - kaddr;
295 		buf += copy;
296 		kunmap_atomic(kaddr);
297 		copy = min(bytes, iov->iov_len - skip);
298 	}
299 	/* Too bad - revert to non-atomic kmap */
300 
301 	kaddr = kmap(page);
302 	to = kaddr + offset;
303 	left = copyin(to, buf, copy);
304 	copy -= left;
305 	skip += copy;
306 	to += copy;
307 	bytes -= copy;
308 	while (unlikely(!left && bytes)) {
309 		iov++;
310 		buf = iov->iov_base;
311 		copy = min(bytes, iov->iov_len);
312 		left = copyin(to, buf, copy);
313 		copy -= left;
314 		skip = copy;
315 		to += copy;
316 		bytes -= copy;
317 	}
318 	kunmap(page);
319 
320 done:
321 	if (skip == iov->iov_len) {
322 		iov++;
323 		skip = 0;
324 	}
325 	i->count -= wanted - bytes;
326 	i->nr_segs -= iov - i->iov;
327 	i->iov = iov;
328 	i->iov_offset = skip;
329 	return wanted - bytes;
330 }
331 
332 #ifdef PIPE_PARANOIA
333 static bool sanity(const struct iov_iter *i)
334 {
335 	struct pipe_inode_info *pipe = i->pipe;
336 	unsigned int p_head = pipe->head;
337 	unsigned int p_tail = pipe->tail;
338 	unsigned int p_mask = pipe->ring_size - 1;
339 	unsigned int p_occupancy = pipe_occupancy(p_head, p_tail);
340 	unsigned int i_head = i->head;
341 	unsigned int idx;
342 
343 	if (i->iov_offset) {
344 		struct pipe_buffer *p;
345 		if (unlikely(p_occupancy == 0))
346 			goto Bad;	// pipe must be non-empty
347 		if (unlikely(i_head != p_head - 1))
348 			goto Bad;	// must be at the last buffer...
349 
350 		p = &pipe->bufs[i_head & p_mask];
351 		if (unlikely(p->offset + p->len != i->iov_offset))
352 			goto Bad;	// ... at the end of segment
353 	} else {
354 		if (i_head != p_head)
355 			goto Bad;	// must be right after the last buffer
356 	}
357 	return true;
358 Bad:
359 	printk(KERN_ERR "idx = %d, offset = %zd\n", i_head, i->iov_offset);
360 	printk(KERN_ERR "head = %d, tail = %d, buffers = %d\n",
361 			p_head, p_tail, pipe->ring_size);
362 	for (idx = 0; idx < pipe->ring_size; idx++)
363 		printk(KERN_ERR "[%p %p %d %d]\n",
364 			pipe->bufs[idx].ops,
365 			pipe->bufs[idx].page,
366 			pipe->bufs[idx].offset,
367 			pipe->bufs[idx].len);
368 	WARN_ON(1);
369 	return false;
370 }
371 #else
372 #define sanity(i) true
373 #endif
374 
375 static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
376 			 struct iov_iter *i)
377 {
378 	struct pipe_inode_info *pipe = i->pipe;
379 	struct pipe_buffer *buf;
380 	unsigned int p_tail = pipe->tail;
381 	unsigned int p_mask = pipe->ring_size - 1;
382 	unsigned int i_head = i->head;
383 	size_t off;
384 
385 	if (unlikely(bytes > i->count))
386 		bytes = i->count;
387 
388 	if (unlikely(!bytes))
389 		return 0;
390 
391 	if (!sanity(i))
392 		return 0;
393 
394 	off = i->iov_offset;
395 	buf = &pipe->bufs[i_head & p_mask];
396 	if (off) {
397 		if (offset == off && buf->page == page) {
398 			/* merge with the last one */
399 			buf->len += bytes;
400 			i->iov_offset += bytes;
401 			goto out;
402 		}
403 		i_head++;
404 		buf = &pipe->bufs[i_head & p_mask];
405 	}
406 	if (pipe_full(i_head, p_tail, pipe->max_usage))
407 		return 0;
408 
409 	buf->ops = &page_cache_pipe_buf_ops;
410 	get_page(page);
411 	buf->page = page;
412 	buf->offset = offset;
413 	buf->len = bytes;
414 
415 	pipe->head = i_head + 1;
416 	i->iov_offset = offset + bytes;
417 	i->head = i_head;
418 out:
419 	i->count -= bytes;
420 	return bytes;
421 }
422 
423 /*
424  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
425  * @bytes.  For each iovec, fault in each page that constitutes the iovec.
426  *
427  * Return 0 on success, or non-zero if the memory could not be accessed (e.g.
428  * because it is an invalid address).
429  */
430 int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
431 {
432 	size_t skip = i->iov_offset;
433 	const struct iovec *iov;
434 	int err;
435 	struct iovec v;
436 
437 	if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
438 		iterate_iovec(i, bytes, v, iov, skip, ({
439 			err = fault_in_pages_readable(v.iov_base, v.iov_len);
440 			if (unlikely(err))
441 				return err;
442 		0;}))
443 	}
444 	return 0;
445 }
446 EXPORT_SYMBOL(iov_iter_fault_in_readable);
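
/*
 * Editorial sketch (not part of the original file): a typical caller
 * pre-faults the user pages before copying, so the copy itself is unlikely
 * to block in the fault path.  Callers that copy under locks pair this
 * pre-fault with the *_atomic copy helpers further down.  The helper name
 * is made up for illustration.
 */
static __maybe_unused ssize_t iov_iter_example_prefaulted_copy(void *dst,
						size_t len,
						struct iov_iter *from)
{
	if (iov_iter_fault_in_readable(from, len))
		return -EFAULT;
	return copy_from_iter(dst, len, from);
}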
447 
448 void iov_iter_init(struct iov_iter *i, unsigned int direction,
449 			const struct iovec *iov, unsigned long nr_segs,
450 			size_t count)
451 {
452 	WARN_ON(direction & ~(READ | WRITE));
453 	direction &= READ | WRITE;
454 
455 	/* It will get better.  Eventually... */
456 	if (uaccess_kernel()) {
457 		i->type = ITER_KVEC | direction;
458 		i->kvec = (struct kvec *)iov;
459 	} else {
460 		i->type = ITER_IOVEC | direction;
461 		i->iov = iov;
462 	}
463 	i->nr_segs = nr_segs;
464 	i->iov_offset = 0;
465 	i->count = count;
466 }
467 EXPORT_SYMBOL(iov_iter_init);
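
/*
 * Editorial sketch (not part of the original file): wrapping a single user
 * buffer in an ITER_IOVEC iterator for a read-style operation, i.e. one
 * where data will later be copied *to* the user buffer.  The helper name is
 * made up; real callers usually go through import_single_range() or
 * import_iovec() below, which also validate the buffer.
 */
static __maybe_unused void iov_iter_example_init_one(struct iov_iter *iter,
						     struct iovec *iov,
						     void __user *ubuf,
						     size_t len)
{
	iov->iov_base = ubuf;
	iov->iov_len = len;
	/* READ: the iterator describes the destination of the transfer */
	iov_iter_init(iter, READ, iov, 1, len);
}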
468 
469 static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
470 {
471 	char *from = kmap_atomic(page);
472 	memcpy(to, from + offset, len);
473 	kunmap_atomic(from);
474 }
475 
476 static void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len)
477 {
478 	char *to = kmap_atomic(page);
479 	memcpy(to + offset, from, len);
480 	kunmap_atomic(to);
481 }
482 
483 static void memzero_page(struct page *page, size_t offset, size_t len)
484 {
485 	char *addr = kmap_atomic(page);
486 	memset(addr + offset, 0, len);
487 	kunmap_atomic(addr);
488 }
489 
490 static inline bool allocated(struct pipe_buffer *buf)
491 {
492 	return buf->ops == &default_pipe_buf_ops;
493 }
494 
495 static inline void data_start(const struct iov_iter *i,
496 			      unsigned int *iter_headp, size_t *offp)
497 {
498 	unsigned int p_mask = i->pipe->ring_size - 1;
499 	unsigned int iter_head = i->head;
500 	size_t off = i->iov_offset;
501 
502 	if (off && (!allocated(&i->pipe->bufs[iter_head & p_mask]) ||
503 		    off == PAGE_SIZE)) {
504 		iter_head++;
505 		off = 0;
506 	}
507 	*iter_headp = iter_head;
508 	*offp = off;
509 }
510 
511 static size_t push_pipe(struct iov_iter *i, size_t size,
512 			int *iter_headp, size_t *offp)
513 {
514 	struct pipe_inode_info *pipe = i->pipe;
515 	unsigned int p_tail = pipe->tail;
516 	unsigned int p_mask = pipe->ring_size - 1;
517 	unsigned int iter_head;
518 	size_t off;
519 	ssize_t left;
520 
521 	if (unlikely(size > i->count))
522 		size = i->count;
523 	if (unlikely(!size))
524 		return 0;
525 
526 	left = size;
527 	data_start(i, &iter_head, &off);
528 	*iter_headp = iter_head;
529 	*offp = off;
530 	if (off) {
531 		left -= PAGE_SIZE - off;
532 		if (left <= 0) {
533 			pipe->bufs[iter_head & p_mask].len += size;
534 			return size;
535 		}
536 		pipe->bufs[iter_head & p_mask].len = PAGE_SIZE;
537 		iter_head++;
538 	}
539 	while (!pipe_full(iter_head, p_tail, pipe->max_usage)) {
540 		struct pipe_buffer *buf = &pipe->bufs[iter_head & p_mask];
541 		struct page *page = alloc_page(GFP_USER);
542 		if (!page)
543 			break;
544 
545 		buf->ops = &default_pipe_buf_ops;
546 		buf->page = page;
547 		buf->offset = 0;
548 		buf->len = min_t(ssize_t, left, PAGE_SIZE);
549 		left -= buf->len;
550 		iter_head++;
551 		pipe->head = iter_head;
552 
553 		if (left == 0)
554 			return size;
555 	}
556 	return size - left;
557 }
558 
559 static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
560 				struct iov_iter *i)
561 {
562 	struct pipe_inode_info *pipe = i->pipe;
563 	unsigned int p_mask = pipe->ring_size - 1;
564 	unsigned int i_head;
565 	size_t n, off;
566 
567 	if (!sanity(i))
568 		return 0;
569 
570 	bytes = n = push_pipe(i, bytes, &i_head, &off);
571 	if (unlikely(!n))
572 		return 0;
573 	do {
574 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
575 		memcpy_to_page(pipe->bufs[i_head & p_mask].page, off, addr, chunk);
576 		i->head = i_head;
577 		i->iov_offset = off + chunk;
578 		n -= chunk;
579 		addr += chunk;
580 		off = 0;
581 		i_head++;
582 	} while (n);
583 	i->count -= bytes;
584 	return bytes;
585 }
586 
587 static __wsum csum_and_memcpy(void *to, const void *from, size_t len,
588 			      __wsum sum, size_t off)
589 {
590 	__wsum next = csum_partial_copy_nocheck(from, to, len);
591 	return csum_block_add(sum, next, off);
592 }
593 
594 static size_t csum_and_copy_to_pipe_iter(const void *addr, size_t bytes,
595 					 struct csum_state *csstate,
596 					 struct iov_iter *i)
597 {
598 	struct pipe_inode_info *pipe = i->pipe;
599 	unsigned int p_mask = pipe->ring_size - 1;
600 	__wsum sum = csstate->csum;
601 	size_t off = csstate->off;
602 	unsigned int i_head;
603 	size_t n, r;
604 
605 	if (!sanity(i))
606 		return 0;
607 
608 	bytes = n = push_pipe(i, bytes, &i_head, &r);
609 	if (unlikely(!n))
610 		return 0;
611 	do {
612 		size_t chunk = min_t(size_t, n, PAGE_SIZE - r);
613 		char *p = kmap_atomic(pipe->bufs[i_head & p_mask].page);
614 		sum = csum_and_memcpy(p + r, addr, chunk, sum, off);
615 		kunmap_atomic(p);
616 		i->head = i_head;
617 		i->iov_offset = r + chunk;
618 		n -= chunk;
619 		off += chunk;
620 		addr += chunk;
621 		r = 0;
622 		i_head++;
623 	} while (n);
624 	i->count -= bytes;
625 	csstate->csum = sum;
626 	csstate->off = off;
627 	return bytes;
628 }
629 
630 size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
631 {
632 	const char *from = addr;
633 	if (unlikely(iov_iter_is_pipe(i)))
634 		return copy_pipe_to_iter(addr, bytes, i);
635 	if (iter_is_iovec(i))
636 		might_fault();
637 	iterate_and_advance(i, bytes, v,
638 		copyout(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len),
639 		memcpy_to_page(v.bv_page, v.bv_offset,
640 			       (from += v.bv_len) - v.bv_len, v.bv_len),
641 		memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
642 	)
643 
644 	return bytes;
645 }
646 EXPORT_SYMBOL(_copy_to_iter);
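
/*
 * Editorial sketch (not part of the original file): a producer filling
 * whatever the iterator describes (user memory, kernel pages, a pipe, ...)
 * from a kernel buffer and turning a completely failed copy into -EFAULT.
 * The helper name is made up for illustration.
 */
static __maybe_unused ssize_t iov_iter_example_fill(struct iov_iter *to,
						    const void *src, size_t len)
{
	size_t copied = copy_to_iter(src, len, to);

	if (!copied && len)
		return -EFAULT;
	return copied;
}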
647 
648 #ifdef CONFIG_ARCH_HAS_COPY_MC
649 static int copyout_mc(void __user *to, const void *from, size_t n)
650 {
651 	if (access_ok(to, n)) {
652 		instrument_copy_to_user(to, from, n);
653 		n = copy_mc_to_user((__force void *) to, from, n);
654 	}
655 	return n;
656 }
657 
658 static unsigned long copy_mc_to_page(struct page *page, size_t offset,
659 		const char *from, size_t len)
660 {
661 	unsigned long ret;
662 	char *to;
663 
664 	to = kmap_atomic(page);
665 	ret = copy_mc_to_kernel(to + offset, from, len);
666 	kunmap_atomic(to);
667 
668 	return ret;
669 }
670 
671 static size_t copy_mc_pipe_to_iter(const void *addr, size_t bytes,
672 				struct iov_iter *i)
673 {
674 	struct pipe_inode_info *pipe = i->pipe;
675 	unsigned int p_mask = pipe->ring_size - 1;
676 	unsigned int i_head;
677 	size_t n, off, xfer = 0;
678 
679 	if (!sanity(i))
680 		return 0;
681 
682 	bytes = n = push_pipe(i, bytes, &i_head, &off);
683 	if (unlikely(!n))
684 		return 0;
685 	do {
686 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
687 		unsigned long rem;
688 
689 		rem = copy_mc_to_page(pipe->bufs[i_head & p_mask].page,
690 					    off, addr, chunk);
691 		i->head = i_head;
692 		i->iov_offset = off + chunk - rem;
693 		xfer += chunk - rem;
694 		if (rem)
695 			break;
696 		n -= chunk;
697 		addr += chunk;
698 		off = 0;
699 		i_head++;
700 	} while (n);
701 	i->count -= xfer;
702 	return xfer;
703 }
704 
705 /**
706  * _copy_mc_to_iter - copy to iter with source memory error exception handling
707  * @addr: source kernel address
708  * @bytes: total transfer length
709  * @i: destination iterator
710  *
711  * The pmem driver deploys this for the dax operation
712  * (dax_copy_to_iter()) for dax reads (bypassing the page cache and the
713  * block layer). Upon a machine check (#MC), read(2) aborts and returns
714  * -EIO or the number of bytes successfully copied.
715  *
716  * The main differences between this and the typical _copy_to_iter() are:
717  *
718  * * Typical tail/residue handling after a fault retries the copy
719  *   byte-by-byte until the fault happens again. Re-triggering machine
720  *   checks is potentially fatal so the implementation uses source
721  *   alignment and poison alignment assumptions to avoid re-triggering
722  *   hardware exceptions.
723  *
724  * * ITER_KVEC, ITER_PIPE, and ITER_BVEC can return short copies.
725  *   Compare to copy_to_iter() where only ITER_IOVEC attempts might return
726  *   a short copy.
727  */
728 size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
729 {
730 	const char *from = addr;
731 	unsigned long rem, curr_addr, s_addr = (unsigned long) addr;
732 
733 	if (unlikely(iov_iter_is_pipe(i)))
734 		return copy_mc_pipe_to_iter(addr, bytes, i);
735 	if (iter_is_iovec(i))
736 		might_fault();
737 	iterate_and_advance(i, bytes, v,
738 		copyout_mc(v.iov_base, (from += v.iov_len) - v.iov_len,
739 			   v.iov_len),
740 		({
741 		rem = copy_mc_to_page(v.bv_page, v.bv_offset,
742 				      (from += v.bv_len) - v.bv_len, v.bv_len);
743 		if (rem) {
744 			curr_addr = (unsigned long) from;
745 			bytes = curr_addr - s_addr - rem;
746 			return bytes;
747 		}
748 		}),
749 		({
750 		rem = copy_mc_to_kernel(v.iov_base, (from += v.iov_len)
751 					- v.iov_len, v.iov_len);
752 		if (rem) {
753 			curr_addr = (unsigned long) from;
754 			bytes = curr_addr - s_addr - rem;
755 			return bytes;
756 		}
757 		})
758 	)
759 
760 	return bytes;
761 }
762 EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
763 #endif /* CONFIG_ARCH_HAS_COPY_MC */
764 
765 size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
766 {
767 	char *to = addr;
768 	if (unlikely(iov_iter_is_pipe(i))) {
769 		WARN_ON(1);
770 		return 0;
771 	}
772 	if (iter_is_iovec(i))
773 		might_fault();
774 	iterate_and_advance(i, bytes, v,
775 		copyin((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
776 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
777 				 v.bv_offset, v.bv_len),
778 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
779 	)
780 
781 	return bytes;
782 }
783 EXPORT_SYMBOL(_copy_from_iter);
784 
785 bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
786 {
787 	char *to = addr;
788 	if (unlikely(iov_iter_is_pipe(i))) {
789 		WARN_ON(1);
790 		return false;
791 	}
792 	if (unlikely(i->count < bytes))
793 		return false;
794 
795 	if (iter_is_iovec(i))
796 		might_fault();
797 	iterate_all_kinds(i, bytes, v, ({
798 		if (copyin((to += v.iov_len) - v.iov_len,
799 				      v.iov_base, v.iov_len))
800 			return false;
801 		0;}),
802 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
803 				 v.bv_offset, v.bv_len),
804 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
805 	)
806 
807 	iov_iter_advance(i, bytes);
808 	return true;
809 }
810 EXPORT_SYMBOL(_copy_from_iter_full);
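
/*
 * Editorial sketch (not part of the original file): the *_full variant is
 * the "all or nothing" flavour - on failure the iterator is not advanced,
 * so a caller that needs a complete fixed-size header can simply bail out
 * and leave the data in place.  The helper name is made up.
 */
static __maybe_unused int iov_iter_example_get_header(void *hdr, size_t hdrlen,
						      struct iov_iter *from)
{
	if (!copy_from_iter_full(hdr, hdrlen, from))
		return -EFAULT;		/* nothing was consumed from @from */
	return 0;
}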
811 
812 size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
813 {
814 	char *to = addr;
815 	if (unlikely(iov_iter_is_pipe(i))) {
816 		WARN_ON(1);
817 		return 0;
818 	}
819 	iterate_and_advance(i, bytes, v,
820 		__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
821 					 v.iov_base, v.iov_len),
822 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
823 				 v.bv_offset, v.bv_len),
824 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
825 	)
826 
827 	return bytes;
828 }
829 EXPORT_SYMBOL(_copy_from_iter_nocache);
830 
831 #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
832 /**
833  * _copy_from_iter_flushcache - write destination through cpu cache
834  * @addr: destination kernel address
835  * @bytes: total transfer length
836  * @iter: source iterator
837  *
838  * The pmem driver arranges for filesystem-dax to use this facility via
839  * dax_copy_from_iter() for ensuring that writes to persistent memory
840  * are flushed through the CPU cache. It is differentiated from
841  * _copy_from_iter_nocache() in that guarantees all data is flushed for
842  * all iterator types. The _copy_from_iter_nocache() only attempts to
843  * bypass the cache for the ITER_IOVEC case, and on some archs may use
844  * instructions that strand dirty-data in the cache.
845  */
846 size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
847 {
848 	char *to = addr;
849 	if (unlikely(iov_iter_is_pipe(i))) {
850 		WARN_ON(1);
851 		return 0;
852 	}
853 	iterate_and_advance(i, bytes, v,
854 		__copy_from_user_flushcache((to += v.iov_len) - v.iov_len,
855 					 v.iov_base, v.iov_len),
856 		memcpy_page_flushcache((to += v.bv_len) - v.bv_len, v.bv_page,
857 				 v.bv_offset, v.bv_len),
858 		memcpy_flushcache((to += v.iov_len) - v.iov_len, v.iov_base,
859 			v.iov_len)
860 	)
861 
862 	return bytes;
863 }
864 EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
865 #endif
866 
867 bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
868 {
869 	char *to = addr;
870 	if (unlikely(iov_iter_is_pipe(i))) {
871 		WARN_ON(1);
872 		return false;
873 	}
874 	if (unlikely(i->count < bytes))
875 		return false;
876 	iterate_all_kinds(i, bytes, v, ({
877 		if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len,
878 					     v.iov_base, v.iov_len))
879 			return false;
880 		0;}),
881 		memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
882 				 v.bv_offset, v.bv_len),
883 		memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
884 	)
885 
886 	iov_iter_advance(i, bytes);
887 	return true;
888 }
889 EXPORT_SYMBOL(_copy_from_iter_full_nocache);
890 
891 static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
892 {
893 	struct page *head;
894 	size_t v = n + offset;
895 
896 	/*
897 	 * The general case needs to access the page order in order
898 	 * to compute the page size.
899 	 * However, we mostly deal with order-0 pages and thus can
900 	 * avoid a possible cache line miss for requests that fit
901 	 * within a single page (and therefore within any page order).
902 	 */
903 	if (n <= v && v <= PAGE_SIZE)
904 		return true;
905 
906 	head = compound_head(page);
907 	v += (page - head) << PAGE_SHIFT;
908 
909 	if (likely(n <= v && v <= (page_size(head))))
910 		return true;
911 	WARN_ON(1);
912 	return false;
913 }
914 
915 size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
916 			 struct iov_iter *i)
917 {
918 	if (unlikely(!page_copy_sane(page, offset, bytes)))
919 		return 0;
920 	if (i->type & (ITER_BVEC|ITER_KVEC)) {
921 		void *kaddr = kmap_atomic(page);
922 		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
923 		kunmap_atomic(kaddr);
924 		return wanted;
925 	} else if (unlikely(iov_iter_is_discard(i)))
926 		return bytes;
927 	else if (likely(!iov_iter_is_pipe(i)))
928 		return copy_page_to_iter_iovec(page, offset, bytes, i);
929 	else
930 		return copy_page_to_iter_pipe(page, offset, bytes, i);
931 }
932 EXPORT_SYMBOL(copy_page_to_iter);
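
/*
 * Editorial sketch (not part of the original file): the classic
 * buffered-read pattern - copy part of a page-cache page into the iterator
 * and bump the file position by however much actually made it out.
 * copy_page_to_iter() advances the iterator itself.  Names are made up.
 */
static __maybe_unused size_t iov_iter_example_send_page(struct page *page,
					size_t offset, size_t len,
					struct iov_iter *to, loff_t *ppos)
{
	size_t copied = copy_page_to_iter(page, offset, len, to);

	*ppos += copied;
	return copied;
}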
933 
934 size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
935 			 struct iov_iter *i)
936 {
937 	if (unlikely(!page_copy_sane(page, offset, bytes)))
938 		return 0;
939 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
940 		WARN_ON(1);
941 		return 0;
942 	}
943 	if (i->type & (ITER_BVEC|ITER_KVEC)) {
944 		void *kaddr = kmap_atomic(page);
945 		size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
946 		kunmap_atomic(kaddr);
947 		return wanted;
948 	} else
949 		return copy_page_from_iter_iovec(page, offset, bytes, i);
950 }
951 EXPORT_SYMBOL(copy_page_from_iter);
952 
953 static size_t pipe_zero(size_t bytes, struct iov_iter *i)
954 {
955 	struct pipe_inode_info *pipe = i->pipe;
956 	unsigned int p_mask = pipe->ring_size - 1;
957 	unsigned int i_head;
958 	size_t n, off;
959 
960 	if (!sanity(i))
961 		return 0;
962 
963 	bytes = n = push_pipe(i, bytes, &i_head, &off);
964 	if (unlikely(!n))
965 		return 0;
966 
967 	do {
968 		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
969 		memzero_page(pipe->bufs[i_head & p_mask].page, off, chunk);
970 		i->head = i_head;
971 		i->iov_offset = off + chunk;
972 		n -= chunk;
973 		off = 0;
974 		i_head++;
975 	} while (n);
976 	i->count -= bytes;
977 	return bytes;
978 }
979 
980 size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
981 {
982 	if (unlikely(iov_iter_is_pipe(i)))
983 		return pipe_zero(bytes, i);
984 	iterate_and_advance(i, bytes, v,
985 		clear_user(v.iov_base, v.iov_len),
986 		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
987 		memset(v.iov_base, 0, v.iov_len)
988 	)
989 
990 	return bytes;
991 }
992 EXPORT_SYMBOL(iov_iter_zero);
993 
994 size_t iov_iter_copy_from_user_atomic(struct page *page,
995 		struct iov_iter *i, unsigned long offset, size_t bytes)
996 {
997 	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
998 	if (unlikely(!page_copy_sane(page, offset, bytes))) {
999 		kunmap_atomic(kaddr);
1000 		return 0;
1001 	}
1002 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1003 		kunmap_atomic(kaddr);
1004 		WARN_ON(1);
1005 		return 0;
1006 	}
1007 	iterate_all_kinds(i, bytes, v,
1008 		copyin((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len),
1009 		memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
1010 				 v.bv_offset, v.bv_len),
1011 		memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
1012 	)
1013 	kunmap_atomic(kaddr);
1014 	return bytes;
1015 }
1016 EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
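
/*
 * Editorial sketch (not part of the original file), loosely modelled on the
 * generic_perform_write() pattern: fault the source in while no locks are
 * held, then copy with the atomic variant (page faults are not serviced
 * during that copy, they merely shorten it), and advance the iterator only
 * by what was actually copied.  Real callers lock the page and retry on a
 * short copy; the helper name is made up for illustration.
 */
static __maybe_unused ssize_t iov_iter_example_write_page(struct page *page,
					unsigned long offset, size_t len,
					struct iov_iter *from)
{
	size_t copied;

	if (iov_iter_fault_in_readable(from, len))
		return -EFAULT;

	copied = iov_iter_copy_from_user_atomic(page, from, offset, len);
	iov_iter_advance(from, copied);

	return copied;
}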
1017 
1018 static inline void pipe_truncate(struct iov_iter *i)
1019 {
1020 	struct pipe_inode_info *pipe = i->pipe;
1021 	unsigned int p_tail = pipe->tail;
1022 	unsigned int p_head = pipe->head;
1023 	unsigned int p_mask = pipe->ring_size - 1;
1024 
1025 	if (!pipe_empty(p_head, p_tail)) {
1026 		struct pipe_buffer *buf;
1027 		unsigned int i_head = i->head;
1028 		size_t off = i->iov_offset;
1029 
1030 		if (off) {
1031 			buf = &pipe->bufs[i_head & p_mask];
1032 			buf->len = off - buf->offset;
1033 			i_head++;
1034 		}
1035 		while (p_head != i_head) {
1036 			p_head--;
1037 			pipe_buf_release(pipe, &pipe->bufs[p_head & p_mask]);
1038 		}
1039 
1040 		pipe->head = p_head;
1041 	}
1042 }
1043 
1044 static void pipe_advance(struct iov_iter *i, size_t size)
1045 {
1046 	struct pipe_inode_info *pipe = i->pipe;
1047 	if (unlikely(i->count < size))
1048 		size = i->count;
1049 	if (size) {
1050 		struct pipe_buffer *buf;
1051 		unsigned int p_mask = pipe->ring_size - 1;
1052 		unsigned int i_head = i->head;
1053 		size_t off = i->iov_offset, left = size;
1054 
1055 		if (off) /* make it relative to the beginning of buffer */
1056 			left += off - pipe->bufs[i_head & p_mask].offset;
1057 		while (1) {
1058 			buf = &pipe->bufs[i_head & p_mask];
1059 			if (left <= buf->len)
1060 				break;
1061 			left -= buf->len;
1062 			i_head++;
1063 		}
1064 		i->head = i_head;
1065 		i->iov_offset = buf->offset + left;
1066 	}
1067 	i->count -= size;
1068 	/* ... and discard everything past that point */
1069 	pipe_truncate(i);
1070 }
1071 
1072 void iov_iter_advance(struct iov_iter *i, size_t size)
1073 {
1074 	if (unlikely(iov_iter_is_pipe(i))) {
1075 		pipe_advance(i, size);
1076 		return;
1077 	}
1078 	if (unlikely(iov_iter_is_discard(i))) {
1079 		i->count -= size;
1080 		return;
1081 	}
1082 	iterate_and_advance(i, size, v, 0, 0, 0)
1083 }
1084 EXPORT_SYMBOL(iov_iter_advance);
1085 
1086 void iov_iter_revert(struct iov_iter *i, size_t unroll)
1087 {
1088 	if (!unroll)
1089 		return;
1090 	if (WARN_ON(unroll > MAX_RW_COUNT))
1091 		return;
1092 	i->count += unroll;
1093 	if (unlikely(iov_iter_is_pipe(i))) {
1094 		struct pipe_inode_info *pipe = i->pipe;
1095 		unsigned int p_mask = pipe->ring_size - 1;
1096 		unsigned int i_head = i->head;
1097 		size_t off = i->iov_offset;
1098 		while (1) {
1099 			struct pipe_buffer *b = &pipe->bufs[i_head & p_mask];
1100 			size_t n = off - b->offset;
1101 			if (unroll < n) {
1102 				off -= unroll;
1103 				break;
1104 			}
1105 			unroll -= n;
1106 			if (!unroll && i_head == i->start_head) {
1107 				off = 0;
1108 				break;
1109 			}
1110 			i_head--;
1111 			b = &pipe->bufs[i_head & p_mask];
1112 			off = b->offset + b->len;
1113 		}
1114 		i->iov_offset = off;
1115 		i->head = i_head;
1116 		pipe_truncate(i);
1117 		return;
1118 	}
1119 	if (unlikely(iov_iter_is_discard(i)))
1120 		return;
1121 	if (unroll <= i->iov_offset) {
1122 		i->iov_offset -= unroll;
1123 		return;
1124 	}
1125 	unroll -= i->iov_offset;
1126 	if (iov_iter_is_bvec(i)) {
1127 		const struct bio_vec *bvec = i->bvec;
1128 		while (1) {
1129 			size_t n = (--bvec)->bv_len;
1130 			i->nr_segs++;
1131 			if (unroll <= n) {
1132 				i->bvec = bvec;
1133 				i->iov_offset = n - unroll;
1134 				return;
1135 			}
1136 			unroll -= n;
1137 		}
1138 	} else { /* same logics for iovec and kvec */
1139 		const struct iovec *iov = i->iov;
1140 		while (1) {
1141 			size_t n = (--iov)->iov_len;
1142 			i->nr_segs++;
1143 			if (unroll <= n) {
1144 				i->iov = iov;
1145 				i->iov_offset = n - unroll;
1146 				return;
1147 			}
1148 			unroll -= n;
1149 		}
1150 	}
1151 }
1152 EXPORT_SYMBOL(iov_iter_revert);
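
/*
 * Editorial sketch (not part of the original file): the usual
 * advance/revert dance - consume data from the iterator optimistically and
 * hand it back if the rest of the operation fails, so the caller sees an
 * unmodified iterator on error.  The helper and its @commit callback are
 * made up for illustration.
 */
static __maybe_unused int iov_iter_example_try_consume(void *buf, size_t len,
					struct iov_iter *from,
					int (*commit)(void *, size_t))
{
	int err;

	if (!copy_from_iter_full(buf, len, from))
		return -EFAULT;

	err = commit(buf, len);
	if (err)
		iov_iter_revert(from, len);	/* undo the consumption */
	return err;
}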
1153 
1154 /*
1155  * Return the count of just the current iov_iter segment.
1156  */
1157 size_t iov_iter_single_seg_count(const struct iov_iter *i)
1158 {
1159 	if (unlikely(iov_iter_is_pipe(i)))
1160 		return i->count;	// it is a silly place, anyway
1161 	if (i->nr_segs == 1)
1162 		return i->count;
1163 	if (unlikely(iov_iter_is_discard(i)))
1164 		return i->count;
1165 	else if (iov_iter_is_bvec(i))
1166 		return min(i->count, i->bvec->bv_len - i->iov_offset);
1167 	else
1168 		return min(i->count, i->iov->iov_len - i->iov_offset);
1169 }
1170 EXPORT_SYMBOL(iov_iter_single_seg_count);
1171 
1172 void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
1173 			const struct kvec *kvec, unsigned long nr_segs,
1174 			size_t count)
1175 {
1176 	WARN_ON(direction & ~(READ | WRITE));
1177 	i->type = ITER_KVEC | (direction & (READ | WRITE));
1178 	i->kvec = kvec;
1179 	i->nr_segs = nr_segs;
1180 	i->iov_offset = 0;
1181 	i->count = count;
1182 }
1183 EXPORT_SYMBOL(iov_iter_kvec);
1184 
1185 void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
1186 			const struct bio_vec *bvec, unsigned long nr_segs,
1187 			size_t count)
1188 {
1189 	WARN_ON(direction & ~(READ | WRITE));
1190 	i->type = ITER_BVEC | (direction & (READ | WRITE));
1191 	i->bvec = bvec;
1192 	i->nr_segs = nr_segs;
1193 	i->iov_offset = 0;
1194 	i->count = count;
1195 }
1196 EXPORT_SYMBOL(iov_iter_bvec);
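
/*
 * Editorial sketch (not part of the original file): describing a single
 * kernel page as an ITER_BVEC iterator, e.g. so that a read_iter/write_iter
 * based interface can be driven with kernel memory instead of user
 * pointers.  The helper name is made up for illustration.
 */
static __maybe_unused void iov_iter_example_bvec_one(struct iov_iter *iter,
						     struct bio_vec *bv,
						     struct page *page,
						     unsigned int len,
						     unsigned int offset)
{
	bv->bv_page = page;
	bv->bv_len = len;
	bv->bv_offset = offset;
	/* WRITE: the iterator is the source of the data being written */
	iov_iter_bvec(iter, WRITE, bv, 1, len);
}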
1197 
1198 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
1199 			struct pipe_inode_info *pipe,
1200 			size_t count)
1201 {
1202 	BUG_ON(direction != READ);
1203 	WARN_ON(pipe_full(pipe->head, pipe->tail, pipe->ring_size));
1204 	i->type = ITER_PIPE | READ;
1205 	i->pipe = pipe;
1206 	i->head = pipe->head;
1207 	i->iov_offset = 0;
1208 	i->count = count;
1209 	i->start_head = i->head;
1210 }
1211 EXPORT_SYMBOL(iov_iter_pipe);
1212 
1213 /**
1214  * iov_iter_discard - Initialise an I/O iterator that discards data
1215  * @i: The iterator to initialise.
1216  * @direction: The direction of the transfer.
1217  * @count: The size of the I/O buffer in bytes.
1218  *
1219  * Set up an I/O iterator that just discards everything that's written to it.
1220  * It's only available as a READ iterator.
1221  */
1222 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
1223 {
1224 	BUG_ON(direction != READ);
1225 	i->type = ITER_DISCARD | READ;
1226 	i->count = count;
1227 	i->iov_offset = 0;
1228 }
1229 EXPORT_SYMBOL(iov_iter_discard);
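
/*
 * Editorial sketch (not part of the original file): a discard iterator is
 * handy when a receive path needs to consume and throw away @len bytes,
 * e.g. to skip an unsupported record while still driving the normal
 * copy-to-iterator machinery.  The helper name is made up.
 */
static __maybe_unused void iov_iter_example_sink(struct iov_iter *iter, size_t len)
{
	iov_iter_discard(iter, READ, len);
	/* anything subsequently "copied" into @iter is simply dropped */
}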
1230 
1231 unsigned long iov_iter_alignment(const struct iov_iter *i)
1232 {
1233 	unsigned long res = 0;
1234 	size_t size = i->count;
1235 
1236 	if (unlikely(iov_iter_is_pipe(i))) {
1237 		unsigned int p_mask = i->pipe->ring_size - 1;
1238 
1239 		if (size && i->iov_offset && allocated(&i->pipe->bufs[i->head & p_mask]))
1240 			return size | i->iov_offset;
1241 		return size;
1242 	}
1243 	iterate_all_kinds(i, size, v,
1244 		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
1245 		res |= v.bv_offset | v.bv_len,
1246 		res |= (unsigned long)v.iov_base | v.iov_len
1247 	)
1248 	return res;
1249 }
1250 EXPORT_SYMBOL(iov_iter_alignment);
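
/*
 * Editorial sketch (not part of the original file): how a direct-I/O path
 * might use iov_iter_alignment() to reject requests whose addresses or
 * lengths are not aligned to a (hypothetical) logical block size, assumed
 * here to be a power of two.
 */
static __maybe_unused bool iov_iter_example_dio_aligned(const struct iov_iter *i,
							unsigned int blksize)
{
	return !(iov_iter_alignment(i) & (blksize - 1));
}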
1251 
1252 unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
1253 {
1254 	unsigned long res = 0;
1255 	size_t size = i->count;
1256 
1257 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1258 		WARN_ON(1);
1259 		return ~0U;
1260 	}
1261 
1262 	iterate_all_kinds(i, size, v,
1263 		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
1264 			(size != v.iov_len ? size : 0), 0),
1265 		(res |= (!res ? 0 : (unsigned long)v.bv_offset) |
1266 			(size != v.bv_len ? size : 0)),
1267 		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
1268 			(size != v.iov_len ? size : 0))
1269 		);
1270 	return res;
1271 }
1272 EXPORT_SYMBOL(iov_iter_gap_alignment);
1273 
1274 static inline ssize_t __pipe_get_pages(struct iov_iter *i,
1275 				size_t maxsize,
1276 				struct page **pages,
1277 				int iter_head,
1278 				size_t *start)
1279 {
1280 	struct pipe_inode_info *pipe = i->pipe;
1281 	unsigned int p_mask = pipe->ring_size - 1;
1282 	ssize_t n = push_pipe(i, maxsize, &iter_head, start);
1283 	if (!n)
1284 		return -EFAULT;
1285 
1286 	maxsize = n;
1287 	n += *start;
1288 	while (n > 0) {
1289 		get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
1290 		iter_head++;
1291 		n -= PAGE_SIZE;
1292 	}
1293 
1294 	return maxsize;
1295 }
1296 
1297 static ssize_t pipe_get_pages(struct iov_iter *i,
1298 		   struct page **pages, size_t maxsize, unsigned maxpages,
1299 		   size_t *start)
1300 {
1301 	unsigned int iter_head, npages;
1302 	size_t capacity;
1303 
1304 	if (!maxsize)
1305 		return 0;
1306 
1307 	if (!sanity(i))
1308 		return -EFAULT;
1309 
1310 	data_start(i, &iter_head, start);
1311 	/* Amount of free space: some of this one + all after this one */
1312 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1313 	capacity = min(npages, maxpages) * PAGE_SIZE - *start;
1314 
1315 	return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, start);
1316 }
1317 
1318 ssize_t iov_iter_get_pages(struct iov_iter *i,
1319 		   struct page **pages, size_t maxsize, unsigned maxpages,
1320 		   size_t *start)
1321 {
1322 	if (maxsize > i->count)
1323 		maxsize = i->count;
1324 
1325 	if (unlikely(iov_iter_is_pipe(i)))
1326 		return pipe_get_pages(i, pages, maxsize, maxpages, start);
1327 	if (unlikely(iov_iter_is_discard(i)))
1328 		return -EFAULT;
1329 
1330 	iterate_all_kinds(i, maxsize, v, ({
1331 		unsigned long addr = (unsigned long)v.iov_base;
1332 		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
1333 		int n;
1334 		int res;
1335 
1336 		if (len > maxpages * PAGE_SIZE)
1337 			len = maxpages * PAGE_SIZE;
1338 		addr &= ~(PAGE_SIZE - 1);
1339 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1340 		res = get_user_pages_fast(addr, n,
1341 				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
1342 				pages);
1343 		if (unlikely(res < 0))
1344 			return res;
1345 		return (res == n ? len : res * PAGE_SIZE) - *start;
1346 	0;}),({
1347 		/* can't be more than PAGE_SIZE */
1348 		*start = v.bv_offset;
1349 		get_page(*pages = v.bv_page);
1350 		return v.bv_len;
1351 	}),({
1352 		return -EFAULT;
1353 	})
1354 	)
1355 	return 0;
1356 }
1357 EXPORT_SYMBOL(iov_iter_get_pages);
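
/*
 * Editorial sketch (not part of the original file): pin the pages behind
 * the start of the iterator, hand them to some (hypothetical) consumer and
 * drop the references again.  iov_iter_get_pages() does not advance the
 * iterator, so callers advance it explicitly once the transfer length is
 * known; real users keep the page references for the duration of the I/O.
 */
static __maybe_unused ssize_t iov_iter_example_pin(struct iov_iter *i,
						   struct page **pages,
						   unsigned int maxpages)
{
	size_t start;
	ssize_t bytes = iov_iter_get_pages(i, pages, maxpages * PAGE_SIZE,
					   maxpages, &start);
	ssize_t n, k;

	if (bytes <= 0)
		return bytes;

	n = DIV_ROUND_UP(bytes + start, PAGE_SIZE);
	/* ... use pages[0..n-1]; the data begins at @start within pages[0] ... */
	for (k = 0; k < n; k++)
		put_page(pages[k]);

	iov_iter_advance(i, bytes);
	return bytes;
}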
1358 
1359 static struct page **get_pages_array(size_t n)
1360 {
1361 	return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
1362 }
1363 
1364 static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
1365 		   struct page ***pages, size_t maxsize,
1366 		   size_t *start)
1367 {
1368 	struct page **p;
1369 	unsigned int iter_head, npages;
1370 	ssize_t n;
1371 
1372 	if (!maxsize)
1373 		return 0;
1374 
1375 	if (!sanity(i))
1376 		return -EFAULT;
1377 
1378 	data_start(i, &iter_head, start);
1379 	/* Amount of free space: some of this one + all after this one */
1380 	npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
1381 	n = npages * PAGE_SIZE - *start;
1382 	if (maxsize > n)
1383 		maxsize = n;
1384 	else
1385 		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
1386 	p = get_pages_array(npages);
1387 	if (!p)
1388 		return -ENOMEM;
1389 	n = __pipe_get_pages(i, maxsize, p, iter_head, start);
1390 	if (n > 0)
1391 		*pages = p;
1392 	else
1393 		kvfree(p);
1394 	return n;
1395 }
1396 
1397 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
1398 		   struct page ***pages, size_t maxsize,
1399 		   size_t *start)
1400 {
1401 	struct page **p;
1402 
1403 	if (maxsize > i->count)
1404 		maxsize = i->count;
1405 
1406 	if (unlikely(iov_iter_is_pipe(i)))
1407 		return pipe_get_pages_alloc(i, pages, maxsize, start);
1408 	if (unlikely(iov_iter_is_discard(i)))
1409 		return -EFAULT;
1410 
1411 	iterate_all_kinds(i, maxsize, v, ({
1412 		unsigned long addr = (unsigned long)v.iov_base;
1413 		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
1414 		int n;
1415 		int res;
1416 
1417 		addr &= ~(PAGE_SIZE - 1);
1418 		n = DIV_ROUND_UP(len, PAGE_SIZE);
1419 		p = get_pages_array(n);
1420 		if (!p)
1421 			return -ENOMEM;
1422 		res = get_user_pages_fast(addr, n,
1423 				iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
1424 		if (unlikely(res < 0)) {
1425 			kvfree(p);
1426 			return res;
1427 		}
1428 		*pages = p;
1429 		return (res == n ? len : res * PAGE_SIZE) - *start;
1430 	0;}),({
1431 		/* can't be more than PAGE_SIZE */
1432 		*start = v.bv_offset;
1433 		*pages = p = get_pages_array(1);
1434 		if (!p)
1435 			return -ENOMEM;
1436 		get_page(*p = v.bv_page);
1437 		return v.bv_len;
1438 	}),({
1439 		return -EFAULT;
1440 	})
1441 	)
1442 	return 0;
1443 }
1444 EXPORT_SYMBOL(iov_iter_get_pages_alloc);
1445 
1446 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
1447 			       struct iov_iter *i)
1448 {
1449 	char *to = addr;
1450 	__wsum sum, next;
1451 	size_t off = 0;
1452 	sum = *csum;
1453 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1454 		WARN_ON(1);
1455 		return 0;
1456 	}
1457 	iterate_and_advance(i, bytes, v, ({
1458 		next = csum_and_copy_from_user(v.iov_base,
1459 					       (to += v.iov_len) - v.iov_len,
1460 					       v.iov_len);
1461 		if (next) {
1462 			sum = csum_block_add(sum, next, off);
1463 			off += v.iov_len;
1464 		}
1465 		next ? 0 : v.iov_len;
1466 	}), ({
1467 		char *p = kmap_atomic(v.bv_page);
1468 		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1469 				      p + v.bv_offset, v.bv_len,
1470 				      sum, off);
1471 		kunmap_atomic(p);
1472 		off += v.bv_len;
1473 	}),({
1474 		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1475 				      v.iov_base, v.iov_len,
1476 				      sum, off);
1477 		off += v.iov_len;
1478 	})
1479 	)
1480 	*csum = sum;
1481 	return bytes;
1482 }
1483 EXPORT_SYMBOL(csum_and_copy_from_iter);
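
/*
 * Editorial sketch (not part of the original file): a networking-style
 * receive helper that pulls @len bytes out of a datagram iterator while
 * folding them into an existing 32-bit partial checksum.  A short return
 * means a fault (or too little data) and is treated as -EFAULT here.
 * The helper name is made up for illustration.
 */
static __maybe_unused int iov_iter_example_copy_and_csum(void *to, size_t len,
							 __wsum *csum,
							 struct iov_iter *from)
{
	if (csum_and_copy_from_iter(to, len, csum, from) != len)
		return -EFAULT;
	return 0;
}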
1484 
1485 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
1486 			       struct iov_iter *i)
1487 {
1488 	char *to = addr;
1489 	__wsum sum, next;
1490 	size_t off = 0;
1491 	sum = *csum;
1492 	if (unlikely(iov_iter_is_pipe(i) || iov_iter_is_discard(i))) {
1493 		WARN_ON(1);
1494 		return false;
1495 	}
1496 	if (unlikely(i->count < bytes))
1497 		return false;
1498 	iterate_all_kinds(i, bytes, v, ({
1499 		next = csum_and_copy_from_user(v.iov_base,
1500 					       (to += v.iov_len) - v.iov_len,
1501 					       v.iov_len);
1502 		if (!next)
1503 			return false;
1504 		sum = csum_block_add(sum, next, off);
1505 		off += v.iov_len;
1506 		0;
1507 	}), ({
1508 		char *p = kmap_atomic(v.bv_page);
1509 		sum = csum_and_memcpy((to += v.bv_len) - v.bv_len,
1510 				      p + v.bv_offset, v.bv_len,
1511 				      sum, off);
1512 		kunmap_atomic(p);
1513 		off += v.bv_len;
1514 	}),({
1515 		sum = csum_and_memcpy((to += v.iov_len) - v.iov_len,
1516 				      v.iov_base, v.iov_len,
1517 				      sum, off);
1518 		off += v.iov_len;
1519 	})
1520 	)
1521 	*csum = sum;
1522 	iov_iter_advance(i, bytes);
1523 	return true;
1524 }
1525 EXPORT_SYMBOL(csum_and_copy_from_iter_full);
1526 
1527 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
1528 			     struct iov_iter *i)
1529 {
1530 	struct csum_state *csstate = _csstate;
1531 	const char *from = addr;
1532 	__wsum sum, next;
1533 	size_t off;
1534 
1535 	if (unlikely(iov_iter_is_pipe(i)))
1536 		return csum_and_copy_to_pipe_iter(addr, bytes, _csstate, i);
1537 
1538 	sum = csstate->csum;
1539 	off = csstate->off;
1540 	if (unlikely(iov_iter_is_discard(i))) {
1541 		WARN_ON(1);	/* for now */
1542 		return 0;
1543 	}
1544 	iterate_and_advance(i, bytes, v, ({
1545 		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
1546 					     v.iov_base,
1547 					     v.iov_len);
1548 		if (next) {
1549 			sum = csum_block_add(sum, next, off);
1550 			off += v.iov_len;
1551 		}
1552 		next ? 0 : v.iov_len;
1553 	}), ({
1554 		char *p = kmap_atomic(v.bv_page);
1555 		sum = csum_and_memcpy(p + v.bv_offset,
1556 				      (from += v.bv_len) - v.bv_len,
1557 				      v.bv_len, sum, off);
1558 		kunmap_atomic(p);
1559 		off += v.bv_len;
1560 	}),({
1561 		sum = csum_and_memcpy(v.iov_base,
1562 				     (from += v.iov_len) - v.iov_len,
1563 				     v.iov_len, sum, off);
1564 		off += v.iov_len;
1565 	})
1566 	)
1567 	csstate->csum = sum;
1568 	csstate->off = off;
1569 	return bytes;
1570 }
1571 EXPORT_SYMBOL(csum_and_copy_to_iter);
1572 
1573 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
1574 		struct iov_iter *i)
1575 {
1576 #ifdef CONFIG_CRYPTO_HASH
1577 	struct ahash_request *hash = hashp;
1578 	struct scatterlist sg;
1579 	size_t copied;
1580 
1581 	copied = copy_to_iter(addr, bytes, i);
1582 	sg_init_one(&sg, addr, copied);
1583 	ahash_request_set_crypt(hash, &sg, NULL, copied);
1584 	crypto_ahash_update(hash);
1585 	return copied;
1586 #else
1587 	return 0;
1588 #endif
1589 }
1590 EXPORT_SYMBOL(hash_and_copy_to_iter);
1591 
1592 int iov_iter_npages(const struct iov_iter *i, int maxpages)
1593 {
1594 	size_t size = i->count;
1595 	int npages = 0;
1596 
1597 	if (!size)
1598 		return 0;
1599 	if (unlikely(iov_iter_is_discard(i)))
1600 		return 0;
1601 
1602 	if (unlikely(iov_iter_is_pipe(i))) {
1603 		struct pipe_inode_info *pipe = i->pipe;
1604 		unsigned int iter_head;
1605 		size_t off;
1606 
1607 		if (!sanity(i))
1608 			return 0;
1609 
1610 		data_start(i, &iter_head, &off);
1611 		/* some of this one + all after this one */
1612 		npages = pipe_space_for_user(iter_head, pipe->tail, pipe);
1613 		if (npages >= maxpages)
1614 			return maxpages;
1615 	} else iterate_all_kinds(i, size, v, ({
1616 		unsigned long p = (unsigned long)v.iov_base;
1617 		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
1618 			- p / PAGE_SIZE;
1619 		if (npages >= maxpages)
1620 			return maxpages;
1621 	0;}),({
1622 		npages++;
1623 		if (npages >= maxpages)
1624 			return maxpages;
1625 	}),({
1626 		unsigned long p = (unsigned long)v.iov_base;
1627 		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
1628 			- p / PAGE_SIZE;
1629 		if (npages >= maxpages)
1630 			return maxpages;
1631 	})
1632 	)
1633 	return npages;
1634 }
1635 EXPORT_SYMBOL(iov_iter_npages);
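
/*
 * Editorial sketch (not part of the original file): using iov_iter_npages()
 * to bound an allocation before pinning pages, the way direct-I/O code
 * sizes its bio or page array.  The helper name is made up for
 * illustration.
 */
static __maybe_unused struct page **iov_iter_example_page_array(const struct iov_iter *i,
								int maxpages,
								int *npages)
{
	*npages = iov_iter_npages(i, maxpages);
	if (!*npages)
		return NULL;
	return kcalloc(*npages, sizeof(struct page *), GFP_KERNEL);
}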
1636 
1637 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
1638 {
1639 	*new = *old;
1640 	if (unlikely(iov_iter_is_pipe(new))) {
1641 		WARN_ON(1);
1642 		return NULL;
1643 	}
1644 	if (unlikely(iov_iter_is_discard(new)))
1645 		return NULL;
1646 	if (iov_iter_is_bvec(new))
1647 		return new->bvec = kmemdup(new->bvec,
1648 				    new->nr_segs * sizeof(struct bio_vec),
1649 				    flags);
1650 	else
1651 		/* iovec and kvec have identical layout */
1652 		return new->iov = kmemdup(new->iov,
1653 				   new->nr_segs * sizeof(struct iovec),
1654 				   flags);
1655 }
1656 EXPORT_SYMBOL(dup_iter);
1657 
1658 static int copy_compat_iovec_from_user(struct iovec *iov,
1659 		const struct iovec __user *uvec, unsigned long nr_segs)
1660 {
1661 	const struct compat_iovec __user *uiov =
1662 		(const struct compat_iovec __user *)uvec;
1663 	int ret = -EFAULT, i;
1664 
1665 	if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
1666 		return -EFAULT;
1667 
1668 	for (i = 0; i < nr_segs; i++) {
1669 		compat_uptr_t buf;
1670 		compat_ssize_t len;
1671 
1672 		unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
1673 		unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
1674 
1675 		/* check for compat_size_t not fitting in compat_ssize_t ... */
1676 		if (len < 0) {
1677 			ret = -EINVAL;
1678 			goto uaccess_end;
1679 		}
1680 		iov[i].iov_base = compat_ptr(buf);
1681 		iov[i].iov_len = len;
1682 	}
1683 
1684 	ret = 0;
1685 uaccess_end:
1686 	user_access_end();
1687 	return ret;
1688 }
1689 
1690 static int copy_iovec_from_user(struct iovec *iov,
1691 		const struct iovec __user *uvec, unsigned long nr_segs)
1692 {
1693 	unsigned long seg;
1694 
1695 	if (copy_from_user(iov, uvec, nr_segs * sizeof(*uvec)))
1696 		return -EFAULT;
1697 	for (seg = 0; seg < nr_segs; seg++) {
1698 		if ((ssize_t)iov[seg].iov_len < 0)
1699 			return -EINVAL;
1700 	}
1701 
1702 	return 0;
1703 }
1704 
1705 struct iovec *iovec_from_user(const struct iovec __user *uvec,
1706 		unsigned long nr_segs, unsigned long fast_segs,
1707 		struct iovec *fast_iov, bool compat)
1708 {
1709 	struct iovec *iov = fast_iov;
1710 	int ret;
1711 
1712 	/*
1713 	 * SuS says "The readv() function *may* fail if the iovcnt argument was
1714 	 * less than or equal to 0, or greater than {IOV_MAX}."  Linux has
1715 	 * traditionally returned zero for zero segments, so...
1716 	 */
1717 	if (nr_segs == 0)
1718 		return iov;
1719 	if (nr_segs > UIO_MAXIOV)
1720 		return ERR_PTR(-EINVAL);
1721 	if (nr_segs > fast_segs) {
1722 		iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
1723 		if (!iov)
1724 			return ERR_PTR(-ENOMEM);
1725 	}
1726 
1727 	if (compat)
1728 		ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
1729 	else
1730 		ret = copy_iovec_from_user(iov, uvec, nr_segs);
1731 	if (ret) {
1732 		if (iov != fast_iov)
1733 			kfree(iov);
1734 		return ERR_PTR(ret);
1735 	}
1736 
1737 	return iov;
1738 }
1739 
1740 ssize_t __import_iovec(int type, const struct iovec __user *uvec,
1741 		 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
1742 		 struct iov_iter *i, bool compat)
1743 {
1744 	ssize_t total_len = 0;
1745 	unsigned long seg;
1746 	struct iovec *iov;
1747 
1748 	iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
1749 	if (IS_ERR(iov)) {
1750 		*iovp = NULL;
1751 		return PTR_ERR(iov);
1752 	}
1753 
1754 	/*
1755 	 * According to the Single Unix Specification we should return EINVAL if
1756 	 * an element length is < 0 when cast to ssize_t or if the total length
1757 	 * would overflow the ssize_t return value of the system call.
1758 	 *
1759 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
1760 	 * overflow case.
1761 	 */
1762 	for (seg = 0; seg < nr_segs; seg++) {
1763 		ssize_t len = (ssize_t)iov[seg].iov_len;
1764 
1765 		if (!access_ok(iov[seg].iov_base, len)) {
1766 			if (iov != *iovp)
1767 				kfree(iov);
1768 			*iovp = NULL;
1769 			return -EFAULT;
1770 		}
1771 
1772 		if (len > MAX_RW_COUNT - total_len) {
1773 			len = MAX_RW_COUNT - total_len;
1774 			iov[seg].iov_len = len;
1775 		}
1776 		total_len += len;
1777 	}
1778 
1779 	iov_iter_init(i, type, iov, nr_segs, total_len);
1780 	if (iov == *iovp)
1781 		*iovp = NULL;
1782 	else
1783 		*iovp = iov;
1784 	return total_len;
1785 }
1786 
1787 /**
1788  * import_iovec() - Copy an array of &struct iovec from userspace
1789  *     into the kernel, check that it is valid, and initialize a new
1790  *     &struct iov_iter iterator to access it.
1791  *
1792  * @type: One of %READ or %WRITE.
1793  * @uvec: Pointer to the userspace array.
1794  * @nr_segs: Number of elements in userspace array.
1795  * @fast_segs: Number of elements in *@iovp.
1796  * @iovp: (input and output parameter) Pointer to pointer to (usually small
1797  *     on-stack) kernel array.
1798  * @i: Pointer to iterator that will be initialized on success.
1799  *
1800  * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
1801  * then this function places %NULL in *@iovp on return. Otherwise, a new
1802  * array will be allocated and the result placed in *@iovp. This means that
1803  * the caller may call kfree() on *@iovp regardless of whether the small
1804  * on-stack array was used or not (and regardless of whether this function
1805  * returns an error or not).
1806  *
1807  * Return: Negative error code on error, bytes imported on success
1808  */
1809 ssize_t import_iovec(int type, const struct iovec __user *uvec,
1810 		 unsigned nr_segs, unsigned fast_segs,
1811 		 struct iovec **iovp, struct iov_iter *i)
1812 {
1813 	return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i,
1814 			      in_compat_syscall());
1815 }
1816 EXPORT_SYMBOL(import_iovec);
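
/*
 * Editorial sketch (not part of the original file): the usual shape of a
 * readv/writev-style caller - import the user iovec array (possibly
 * spilling to a heap allocation), run the I/O through a callback, then free
 * whatever import_iovec() left in @iov.  kfree(NULL) is a no-op, so this is
 * safe whether or not the on-stack array was used.  @do_io is hypothetical.
 */
static __maybe_unused ssize_t iov_iter_example_vectored(int type,
				const struct iovec __user *uvec,
				unsigned int nr_segs,
				ssize_t (*do_io)(struct iov_iter *))
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(type, uvec, nr_segs, ARRAY_SIZE(iovstack),
			   &iov, &iter);
	if (ret < 0)
		return ret;

	ret = do_io(&iter);
	kfree(iov);
	return ret;
}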
1817 
1818 int import_single_range(int rw, void __user *buf, size_t len,
1819 		 struct iovec *iov, struct iov_iter *i)
1820 {
1821 	if (len > MAX_RW_COUNT)
1822 		len = MAX_RW_COUNT;
1823 	if (unlikely(!access_ok(buf, len)))
1824 		return -EFAULT;
1825 
1826 	iov->iov_base = buf;
1827 	iov->iov_len = len;
1828 	iov_iter_init(i, rw, iov, 1, len);
1829 	return 0;
1830 }
1831 EXPORT_SYMBOL(import_single_range);
1832 
1833 int iov_iter_for_each_range(struct iov_iter *i, size_t bytes,
1834 			    int (*f)(struct kvec *vec, void *context),
1835 			    void *context)
1836 {
1837 	struct kvec w;
1838 	int err = -EINVAL;
1839 	if (!bytes)
1840 		return 0;
1841 
1842 	iterate_all_kinds(i, bytes, v, -EINVAL, ({
1843 		w.iov_base = kmap(v.bv_page) + v.bv_offset;
1844 		w.iov_len = v.bv_len;
1845 		err = f(&w, context);
1846 		kunmap(v.bv_page);
1847 		err;}), ({
1848 		w = v;
1849 		err = f(&w, context);})
1850 	)
1851 	return err;
1852 }
1853 EXPORT_SYMBOL(iov_iter_for_each_range);
1854