xref: /openbmc/linux/fs/iomap/buffered-io.c (revision 9659281c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Red Hat, Inc.
4  * Copyright (C) 2016-2019 Christoph Hellwig.
5  */
6 #include <linux/module.h>
7 #include <linux/compiler.h>
8 #include <linux/fs.h>
9 #include <linux/iomap.h>
10 #include <linux/pagemap.h>
11 #include <linux/uio.h>
12 #include <linux/buffer_head.h>
13 #include <linux/dax.h>
14 #include <linux/writeback.h>
15 #include <linux/list_sort.h>
16 #include <linux/swap.h>
17 #include <linux/bio.h>
18 #include <linux/sched/signal.h>
19 #include <linux/migrate.h>
20 #include "trace.h"
21 
22 #include "../internal.h"
23 
24 /*
25  * Structure allocated for each page or THP when block size < page size
26  * to track sub-page uptodate status and I/O completions.
27  */
28 struct iomap_page {
29 	atomic_t		read_bytes_pending;
30 	atomic_t		write_bytes_pending;
31 	spinlock_t		uptodate_lock;
32 	unsigned long		uptodate[];
33 };
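
/*
 * For example, with 1 KiB filesystem blocks and a 4 KiB page,
 * i_blocks_per_page() returns 4, so iomap_page_create() below allocates
 * struct_size(iop, uptodate, BITS_TO_LONGS(4)) bytes: the fields above
 * plus a single unsigned long whose low four bits track the per-block
 * uptodate state.
 */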
34 
35 static inline struct iomap_page *to_iomap_page(struct page *page)
36 {
37 	/*
38 	 * per-block data is stored in the head page.  Callers should
39 	 * not be dealing with tail pages (and if they are, they can
40 	 * call thp_head() first.
41 	 * call thp_head() first).
42 	VM_BUG_ON_PGFLAGS(PageTail(page), page);
43 
44 	if (page_has_private(page))
45 		return (struct iomap_page *)page_private(page);
46 	return NULL;
47 }
48 
49 static struct bio_set iomap_ioend_bioset;
50 
51 static struct iomap_page *
52 iomap_page_create(struct inode *inode, struct page *page)
53 {
54 	struct iomap_page *iop = to_iomap_page(page);
55 	unsigned int nr_blocks = i_blocks_per_page(inode, page);
56 
57 	if (iop || nr_blocks <= 1)
58 		return iop;
59 
60 	iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
61 			GFP_NOFS | __GFP_NOFAIL);
62 	spin_lock_init(&iop->uptodate_lock);
63 	if (PageUptodate(page))
64 		bitmap_fill(iop->uptodate, nr_blocks);
65 	attach_page_private(page, iop);
66 	return iop;
67 }
68 
69 static void
70 iomap_page_release(struct page *page)
71 {
72 	struct iomap_page *iop = detach_page_private(page);
73 	unsigned int nr_blocks = i_blocks_per_page(page->mapping->host, page);
74 
75 	if (!iop)
76 		return;
77 	WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending));
78 	WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending));
79 	WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) !=
80 			PageUptodate(page));
81 	kfree(iop);
82 }
83 
84 /*
85  * Calculate the range inside the page that we actually need to read.
86  */
87 static void
88 iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
89 		loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
90 {
91 	loff_t orig_pos = *pos;
92 	loff_t isize = i_size_read(inode);
93 	unsigned block_bits = inode->i_blkbits;
94 	unsigned block_size = (1 << block_bits);
95 	unsigned poff = offset_in_page(*pos);
96 	unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
97 	unsigned first = poff >> block_bits;
98 	unsigned last = (poff + plen - 1) >> block_bits;
99 
100 	/*
101 	 * If the block size is smaller than the page size we need to check the
102 	 * per-block uptodate status and adjust the offset and length if needed
103 	 * to avoid reading in already uptodate ranges.
104 	 */
105 	if (iop) {
106 		unsigned int i;
107 
108 		/* move forward for each leading block marked uptodate */
109 		for (i = first; i <= last; i++) {
110 			if (!test_bit(i, iop->uptodate))
111 				break;
112 			*pos += block_size;
113 			poff += block_size;
114 			plen -= block_size;
115 			first++;
116 		}
117 
118 		/* truncate len if we find any trailing uptodate block(s) */
119 		for ( ; i <= last; i++) {
120 			if (test_bit(i, iop->uptodate)) {
121 				plen -= (last - i + 1) * block_size;
122 				last = i - 1;
123 				break;
124 			}
125 		}
126 	}
127 
128 	/*
129 	 * If the extent spans the block that contains the i_size we need to
130 	 * handle both halves separately so that we properly zero data in the
131 	 * page cache for blocks that are entirely outside of i_size.
132 	 */
133 	if (orig_pos <= isize && orig_pos + length > isize) {
134 		unsigned end = offset_in_page(isize - 1) >> block_bits;
135 
136 		if (first <= end && last > end)
137 			plen -= (last - end) * block_size;
138 	}
139 
140 	*offp = poff;
141 	*lenp = plen;
142 }
143 
144 static void
145 iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len)
146 {
147 	struct iomap_page *iop = to_iomap_page(page);
148 	struct inode *inode = page->mapping->host;
149 	unsigned first = off >> inode->i_blkbits;
150 	unsigned last = (off + len - 1) >> inode->i_blkbits;
151 	unsigned long flags;
152 
153 	spin_lock_irqsave(&iop->uptodate_lock, flags);
154 	bitmap_set(iop->uptodate, first, last - first + 1);
155 	if (bitmap_full(iop->uptodate, i_blocks_per_page(inode, page)))
156 		SetPageUptodate(page);
157 	spin_unlock_irqrestore(&iop->uptodate_lock, flags);
158 }
159 
160 static void
161 iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len)
162 {
163 	if (PageError(page))
164 		return;
165 
166 	if (page_has_private(page))
167 		iomap_iop_set_range_uptodate(page, off, len);
168 	else
169 		SetPageUptodate(page);
170 }
171 
172 static void
173 iomap_read_page_end_io(struct bio_vec *bvec, int error)
174 {
175 	struct page *page = bvec->bv_page;
176 	struct iomap_page *iop = to_iomap_page(page);
177 
178 	if (unlikely(error)) {
179 		ClearPageUptodate(page);
180 		SetPageError(page);
181 	} else {
182 		iomap_set_range_uptodate(page, bvec->bv_offset, bvec->bv_len);
183 	}
184 
185 	if (!iop || atomic_sub_and_test(bvec->bv_len, &iop->read_bytes_pending))
186 		unlock_page(page);
187 }
188 
189 static void
190 iomap_read_end_io(struct bio *bio)
191 {
192 	int error = blk_status_to_errno(bio->bi_status);
193 	struct bio_vec *bvec;
194 	struct bvec_iter_all iter_all;
195 
196 	bio_for_each_segment_all(bvec, bio, iter_all)
197 		iomap_read_page_end_io(bvec, error);
198 	bio_put(bio);
199 }
200 
201 struct iomap_readpage_ctx {
202 	struct page		*cur_page;
203 	bool			cur_page_in_bio;
204 	struct bio		*bio;
205 	struct readahead_control *rac;
206 };
207 
208 static void
209 iomap_read_inline_data(struct inode *inode, struct page *page,
210 		struct iomap *iomap)
211 {
212 	size_t size = i_size_read(inode);
213 	void *addr;
214 
215 	if (PageUptodate(page))
216 		return;
217 
218 	BUG_ON(page_has_private(page));
219 	BUG_ON(page->index);
220 	BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data));
221 
222 	addr = kmap_atomic(page);
223 	memcpy(addr, iomap->inline_data, size);
224 	memset(addr + size, 0, PAGE_SIZE - size);
225 	kunmap_atomic(addr);
226 	SetPageUptodate(page);
227 }
228 
229 static inline bool iomap_block_needs_zeroing(struct inode *inode,
230 		struct iomap *iomap, loff_t pos)
231 {
232 	return iomap->type != IOMAP_MAPPED ||
233 		(iomap->flags & IOMAP_F_NEW) ||
234 		pos >= i_size_read(inode);
235 }
236 
237 static loff_t
238 iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
239 		struct iomap *iomap, struct iomap *srcmap)
240 {
241 	struct iomap_readpage_ctx *ctx = data;
242 	struct page *page = ctx->cur_page;
243 	struct iomap_page *iop;
244 	bool same_page = false, is_contig = false;
245 	loff_t orig_pos = pos;
246 	unsigned poff, plen;
247 	sector_t sector;
248 
249 	if (iomap->type == IOMAP_INLINE) {
250 		WARN_ON_ONCE(pos);
251 		iomap_read_inline_data(inode, page, iomap);
252 		return PAGE_SIZE;
253 	}
254 
255 	/* zero post-eof blocks as the page may be mapped */
256 	iop = iomap_page_create(inode, page);
257 	iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen);
258 	if (plen == 0)
259 		goto done;
260 
261 	if (iomap_block_needs_zeroing(inode, iomap, pos)) {
262 		zero_user(page, poff, plen);
263 		iomap_set_range_uptodate(page, poff, plen);
264 		goto done;
265 	}
266 
267 	ctx->cur_page_in_bio = true;
268 	if (iop)
269 		atomic_add(plen, &iop->read_bytes_pending);
270 
271 	/* Try to merge into a previous segment if we can */
272 	sector = iomap_sector(iomap, pos);
273 	if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
274 		if (__bio_try_merge_page(ctx->bio, page, plen, poff,
275 				&same_page))
276 			goto done;
277 		is_contig = true;
278 	}
279 
280 	if (!is_contig || bio_full(ctx->bio, plen)) {
281 		gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
282 		gfp_t orig_gfp = gfp;
283 		unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
284 
285 		if (ctx->bio)
286 			submit_bio(ctx->bio);
287 
288 		if (ctx->rac) /* same as readahead_gfp_mask */
289 			gfp |= __GFP_NORETRY | __GFP_NOWARN;
290 		ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs));
291 		/*
292 		 * If the bio_alloc fails, try it again for a single page to
293 		 * avoid having to deal with partial page reads.  This emulates
294 		 * what do_mpage_readpage does.
295 		 */
296 		if (!ctx->bio)
297 			ctx->bio = bio_alloc(orig_gfp, 1);
298 		ctx->bio->bi_opf = REQ_OP_READ;
299 		if (ctx->rac)
300 			ctx->bio->bi_opf |= REQ_RAHEAD;
301 		ctx->bio->bi_iter.bi_sector = sector;
302 		bio_set_dev(ctx->bio, iomap->bdev);
303 		ctx->bio->bi_end_io = iomap_read_end_io;
304 	}
305 
306 	bio_add_page(ctx->bio, page, plen, poff);
307 done:
308 	/*
309 	 * Move the caller beyond our range so that it keeps making progress.
310 	 * For that we have to include any leading non-uptodate ranges, but
311 	 * we can skip trailing ones as they will be handled in the next
312 	 * iteration.
313 	 */
314 	return pos - orig_pos + plen;
315 }
316 
317 int
318 iomap_readpage(struct page *page, const struct iomap_ops *ops)
319 {
320 	struct iomap_readpage_ctx ctx = { .cur_page = page };
321 	struct inode *inode = page->mapping->host;
322 	unsigned poff;
323 	loff_t ret;
324 
325 	trace_iomap_readpage(page->mapping->host, 1);
326 
327 	for (poff = 0; poff < PAGE_SIZE; poff += ret) {
328 		ret = iomap_apply(inode, page_offset(page) + poff,
329 				PAGE_SIZE - poff, 0, ops, &ctx,
330 				iomap_readpage_actor);
331 		if (ret <= 0) {
332 			WARN_ON_ONCE(ret == 0);
333 			SetPageError(page);
334 			break;
335 		}
336 	}
337 
338 	if (ctx.bio) {
339 		submit_bio(ctx.bio);
340 		WARN_ON_ONCE(!ctx.cur_page_in_bio);
341 	} else {
342 		WARN_ON_ONCE(ctx.cur_page_in_bio);
343 		unlock_page(page);
344 	}
345 
346 	/*
347 	 * Just like mpage_readahead and block_read_full_page we always
348 	 * return 0 and just mark the page as PageError on errors.  This
349 	 * should be cleaned up all through the stack eventually.
350 	 */
351 	return 0;
352 }
353 EXPORT_SYMBOL_GPL(iomap_readpage);
354 
355 static loff_t
356 iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length,
357 		void *data, struct iomap *iomap, struct iomap *srcmap)
358 {
359 	struct iomap_readpage_ctx *ctx = data;
360 	loff_t done, ret;
361 
362 	for (done = 0; done < length; done += ret) {
363 		if (ctx->cur_page && offset_in_page(pos + done) == 0) {
364 			if (!ctx->cur_page_in_bio)
365 				unlock_page(ctx->cur_page);
366 			put_page(ctx->cur_page);
367 			ctx->cur_page = NULL;
368 		}
369 		if (!ctx->cur_page) {
370 			ctx->cur_page = readahead_page(ctx->rac);
371 			ctx->cur_page_in_bio = false;
372 		}
373 		ret = iomap_readpage_actor(inode, pos + done, length - done,
374 				ctx, iomap, srcmap);
375 	}
376 
377 	return done;
378 }
379 
380 /**
381  * iomap_readahead - Attempt to read pages from a file.
382  * @rac: Describes the pages to be read.
383  * @ops: The operations vector for the filesystem.
384  *
385  * This function is for filesystems to call to implement their readahead
386  * address_space operation.
387  *
388  * Context: The @ops callbacks may submit I/O (eg to read the addresses of
389  * blocks from disc), and may wait for it.  The caller may be trying to
390  * access a different page, and so sleeping excessively should be avoided.
391  * It may allocate memory, but should avoid costly allocations.  This
392  * function is called with memalloc_nofs set, so allocations will not cause
393  * the filesystem to be reentered.
394  */
395 void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
396 {
397 	struct inode *inode = rac->mapping->host;
398 	loff_t pos = readahead_pos(rac);
399 	size_t length = readahead_length(rac);
400 	struct iomap_readpage_ctx ctx = {
401 		.rac	= rac,
402 	};
403 
404 	trace_iomap_readahead(inode, readahead_count(rac));
405 
406 	while (length > 0) {
407 		ssize_t ret = iomap_apply(inode, pos, length, 0, ops,
408 				&ctx, iomap_readahead_actor);
409 		if (ret <= 0) {
410 			WARN_ON_ONCE(ret == 0);
411 			break;
412 		}
413 		pos += ret;
414 		length -= ret;
415 	}
416 
417 	if (ctx.bio)
418 		submit_bio(ctx.bio);
419 	if (ctx.cur_page) {
420 		if (!ctx.cur_page_in_bio)
421 			unlock_page(ctx.cur_page);
422 		put_page(ctx.cur_page);
423 	}
424 }
425 EXPORT_SYMBOL_GPL(iomap_readahead);
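
/*
 * Illustrative sketch, not part of this file: a filesystem typically
 * implements its ->readpage and ->readahead address_space operations as
 * thin wrappers around the helpers above.  The "myfs_" names and
 * myfs_iomap_ops are hypothetical placeholders for the filesystem's own
 * struct iomap_ops.
 */
#if 0	/* example only */
static int myfs_readpage(struct file *unused, struct page *page)
{
	return iomap_readpage(page, &myfs_iomap_ops);
}

static void myfs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &myfs_iomap_ops);
}
#endif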
426 
427 /*
428  * iomap_is_partially_uptodate checks whether blocks within a page are
429  * uptodate or not.
430  *
431  * Returns true if all blocks which correspond to a file portion
432  * we want to read within the page are uptodate.
433  */
434 int
435 iomap_is_partially_uptodate(struct page *page, unsigned long from,
436 		unsigned long count)
437 {
438 	struct iomap_page *iop = to_iomap_page(page);
439 	struct inode *inode = page->mapping->host;
440 	unsigned len, first, last;
441 	unsigned i;
442 
443 	/* Limit range to one page */
444 	len = min_t(unsigned, PAGE_SIZE - from, count);
445 
446 	/* First and last blocks in range within page */
447 	first = from >> inode->i_blkbits;
448 	last = (from + len - 1) >> inode->i_blkbits;
449 
450 	if (iop) {
451 		for (i = first; i <= last; i++)
452 			if (!test_bit(i, iop->uptodate))
453 				return 0;
454 		return 1;
455 	}
456 
457 	return 0;
458 }
459 EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
460 
461 int
462 iomap_releasepage(struct page *page, gfp_t gfp_mask)
463 {
464 	trace_iomap_releasepage(page->mapping->host, page_offset(page),
465 			PAGE_SIZE);
466 
467 	/*
468 	 * mm accommodates an old ext3 case where clean pages might not have had
469 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
470 	 * ->releasepage() via shrink_active_list(); skip those here.
471 	 */
472 	if (PageDirty(page) || PageWriteback(page))
473 		return 0;
474 	iomap_page_release(page);
475 	return 1;
476 }
477 EXPORT_SYMBOL_GPL(iomap_releasepage);
478 
479 void
480 iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
481 {
482 	trace_iomap_invalidatepage(page->mapping->host, offset, len);
483 
484 	/*
485 	 * If we are invalidating the entire page, clear the dirty state from it
486 	 * and release it to avoid unnecessary buildup of the LRU.
487 	 */
488 	if (offset == 0 && len == PAGE_SIZE) {
489 		WARN_ON_ONCE(PageWriteback(page));
490 		cancel_dirty_page(page);
491 		iomap_page_release(page);
492 	}
493 }
494 EXPORT_SYMBOL_GPL(iomap_invalidatepage);
495 
496 #ifdef CONFIG_MIGRATION
497 int
498 iomap_migrate_page(struct address_space *mapping, struct page *newpage,
499 		struct page *page, enum migrate_mode mode)
500 {
501 	int ret;
502 
503 	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
504 	if (ret != MIGRATEPAGE_SUCCESS)
505 		return ret;
506 
507 	if (page_has_private(page))
508 		attach_page_private(newpage, detach_page_private(page));
509 
510 	if (mode != MIGRATE_SYNC_NO_COPY)
511 		migrate_page_copy(newpage, page);
512 	else
513 		migrate_page_states(newpage, page);
514 	return MIGRATEPAGE_SUCCESS;
515 }
516 EXPORT_SYMBOL_GPL(iomap_migrate_page);
517 #endif /* CONFIG_MIGRATION */
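
/*
 * Illustrative sketch, not part of this file: the exported page cache
 * helpers above slot directly into an iomap-based filesystem's
 * address_space_operations.  All "myfs_" names are hypothetical;
 * myfs_readpage/myfs_readahead are the wrappers sketched after
 * iomap_readahead() above, and iomap_migrate_page is only available
 * when CONFIG_MIGRATION is enabled.
 */
#if 0	/* example only */
static const struct address_space_operations myfs_aops = {
	.readpage		= myfs_readpage,
	.readahead		= myfs_readahead,
	.releasepage		= iomap_releasepage,
	.invalidatepage		= iomap_invalidatepage,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
};
#endif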
518 
519 enum {
520 	IOMAP_WRITE_F_UNSHARE		= (1 << 0),
521 };
522 
523 static void
524 iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
525 {
526 	loff_t i_size = i_size_read(inode);
527 
528 	/*
529 	 * Only truncate newly allocated pages beyoned EOF, even if the
530 	 * write started inside the existing inode size.
531 	 */
532 	if (pos + len > i_size)
533 		truncate_pagecache_range(inode, max(pos, i_size), pos + len);
534 }
535 
536 static int
537 iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
538 		unsigned plen, struct iomap *iomap)
539 {
540 	struct bio_vec bvec;
541 	struct bio bio;
542 
543 	bio_init(&bio, &bvec, 1);
544 	bio.bi_opf = REQ_OP_READ;
545 	bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
546 	bio_set_dev(&bio, iomap->bdev);
547 	__bio_add_page(&bio, page, plen, poff);
548 	return submit_bio_wait(&bio);
549 }
550 
551 static int
552 __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
553 		struct page *page, struct iomap *srcmap)
554 {
555 	struct iomap_page *iop = iomap_page_create(inode, page);
556 	loff_t block_size = i_blocksize(inode);
557 	loff_t block_start = round_down(pos, block_size);
558 	loff_t block_end = round_up(pos + len, block_size);
559 	unsigned from = offset_in_page(pos), to = from + len, poff, plen;
560 
561 	if (PageUptodate(page))
562 		return 0;
563 	ClearPageError(page);
564 
565 	do {
566 		iomap_adjust_read_range(inode, iop, &block_start,
567 				block_end - block_start, &poff, &plen);
568 		if (plen == 0)
569 			break;
570 
571 		if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
572 		    (from <= poff || from >= poff + plen) &&
573 		    (to <= poff || to >= poff + plen))
574 			continue;
575 
576 		if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
577 			if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
578 				return -EIO;
579 			zero_user_segments(page, poff, from, to, poff + plen);
580 		} else {
581 			int status = iomap_read_page_sync(block_start, page,
582 					poff, plen, srcmap);
583 			if (status)
584 				return status;
585 		}
586 		iomap_set_range_uptodate(page, poff, plen);
587 	} while ((block_start += plen) < block_end);
588 
589 	return 0;
590 }
591 
592 static int
593 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
594 		struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
595 {
596 	const struct iomap_page_ops *page_ops = iomap->page_ops;
597 	struct page *page;
598 	int status = 0;
599 
600 	BUG_ON(pos + len > iomap->offset + iomap->length);
601 	if (srcmap != iomap)
602 		BUG_ON(pos + len > srcmap->offset + srcmap->length);
603 
604 	if (fatal_signal_pending(current))
605 		return -EINTR;
606 
607 	if (page_ops && page_ops->page_prepare) {
608 		status = page_ops->page_prepare(inode, pos, len, iomap);
609 		if (status)
610 			return status;
611 	}
612 
613 	page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
614 			AOP_FLAG_NOFS);
615 	if (!page) {
616 		status = -ENOMEM;
617 		goto out_no_page;
618 	}
619 
620 	if (srcmap->type == IOMAP_INLINE)
621 		iomap_read_inline_data(inode, page, srcmap);
622 	else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
623 		status = __block_write_begin_int(page, pos, len, NULL, srcmap);
624 	else
625 		status = __iomap_write_begin(inode, pos, len, flags, page,
626 				srcmap);
627 
628 	if (unlikely(status))
629 		goto out_unlock;
630 
631 	*pagep = page;
632 	return 0;
633 
634 out_unlock:
635 	unlock_page(page);
636 	put_page(page);
637 	iomap_write_failed(inode, pos, len);
638 
639 out_no_page:
640 	if (page_ops && page_ops->page_done)
641 		page_ops->page_done(inode, pos, 0, NULL, iomap);
642 	return status;
643 }
644 
645 static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len,
646 		size_t copied, struct page *page)
647 {
648 	flush_dcache_page(page);
649 
650 	/*
651 	 * The blocks that were entirely written will now be uptodate, so we
652 	 * don't have to worry about a readpage reading them and overwriting a
653 	 * partial write.  However if we have encountered a short write and only
654 	 * partially written into a block, it will not be marked uptodate, so a
655 	 * readpage might come in and destroy our partial write.
656 	 *
657 	 * Do the simplest thing, and just treat any short write to a non
658 	 * uptodate page as a zero-length write, and force the caller to redo
659 	 * the whole thing.
660 	 */
661 	if (unlikely(copied < len && !PageUptodate(page)))
662 		return 0;
663 	iomap_set_range_uptodate(page, offset_in_page(pos), len);
664 	__set_page_dirty_nobuffers(page);
665 	return copied;
666 }
667 
668 static size_t iomap_write_end_inline(struct inode *inode, struct page *page,
669 		struct iomap *iomap, loff_t pos, size_t copied)
670 {
671 	void *addr;
672 
673 	WARN_ON_ONCE(!PageUptodate(page));
674 	BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data));
675 
676 	flush_dcache_page(page);
677 	addr = kmap_atomic(page);
678 	memcpy(iomap->inline_data + pos, addr + pos, copied);
679 	kunmap_atomic(addr);
680 
681 	mark_inode_dirty(inode);
682 	return copied;
683 }
684 
685 /* Returns the number of bytes copied.  May be 0.  Cannot be an errno. */
686 static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len,
687 		size_t copied, struct page *page, struct iomap *iomap,
688 		struct iomap *srcmap)
689 {
690 	const struct iomap_page_ops *page_ops = iomap->page_ops;
691 	loff_t old_size = inode->i_size;
692 	size_t ret;
693 
694 	if (srcmap->type == IOMAP_INLINE) {
695 		ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
696 	} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
697 		ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
698 				page, NULL);
699 	} else {
700 		ret = __iomap_write_end(inode, pos, len, copied, page);
701 	}
702 
703 	/*
704 	 * Update the in-memory inode size after copying the data into the page
705 	 * cache.  It's up to the file system to write the updated size to disk,
706 	 * preferably after I/O completion so that no stale data is exposed.
707 	 */
708 	if (pos + ret > old_size) {
709 		i_size_write(inode, pos + ret);
710 		iomap->flags |= IOMAP_F_SIZE_CHANGED;
711 	}
712 	unlock_page(page);
713 
714 	if (old_size < pos)
715 		pagecache_isize_extended(inode, old_size, pos);
716 	if (page_ops && page_ops->page_done)
717 		page_ops->page_done(inode, pos, ret, page, iomap);
718 	put_page(page);
719 
720 	if (ret < len)
721 		iomap_write_failed(inode, pos, len);
722 	return ret;
723 }
724 
725 static loff_t
726 iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
727 		struct iomap *iomap, struct iomap *srcmap)
728 {
729 	struct iov_iter *i = data;
730 	long status = 0;
731 	ssize_t written = 0;
732 
733 	do {
734 		struct page *page;
735 		unsigned long offset;	/* Offset into pagecache page */
736 		unsigned long bytes;	/* Bytes to write to page */
737 		size_t copied;		/* Bytes copied from user */
738 
739 		offset = offset_in_page(pos);
740 		bytes = min_t(unsigned long, PAGE_SIZE - offset,
741 						iov_iter_count(i));
742 again:
743 		if (bytes > length)
744 			bytes = length;
745 
746 		/*
747 		 * Bring in the user page that we will copy from _first_.
748 		 * Otherwise there's a nasty deadlock on copying from the
749 		 * same page as we're writing to, without it being marked
750 		 * up-to-date.
751 		 */
752 		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
753 			status = -EFAULT;
754 			break;
755 		}
756 
757 		status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
758 				srcmap);
759 		if (unlikely(status))
760 			break;
761 
762 		if (mapping_writably_mapped(inode->i_mapping))
763 			flush_dcache_page(page);
764 
765 		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
766 
767 		status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
768 				srcmap);
769 
770 		if (unlikely(copied != status))
771 			iov_iter_revert(i, copied - status);
772 
773 		cond_resched();
774 		if (unlikely(status == 0)) {
775 			/*
776 			 * A short copy made iomap_write_end() reject the
777 			 * thing entirely.  Might be memory poisoning
778 			 * halfway through, might be a race with munmap,
779 			 * might be severe memory pressure.
780 			 */
781 			if (copied)
782 				bytes = copied;
783 			goto again;
784 		}
785 		pos += status;
786 		written += status;
787 		length -= status;
788 
789 		balance_dirty_pages_ratelimited(inode->i_mapping);
790 	} while (iov_iter_count(i) && length);
791 
792 	return written ? written : status;
793 }
794 
795 ssize_t
796 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
797 		const struct iomap_ops *ops)
798 {
799 	struct inode *inode = iocb->ki_filp->f_mapping->host;
800 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
801 
802 	while (iov_iter_count(iter)) {
803 		ret = iomap_apply(inode, pos, iov_iter_count(iter),
804 				IOMAP_WRITE, ops, iter, iomap_write_actor);
805 		if (ret <= 0)
806 			break;
807 		pos += ret;
808 		written += ret;
809 	}
810 
811 	return written ? written : ret;
812 }
813 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
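
/*
 * Illustrative sketch, not part of this file: a filesystem's buffered
 * ->write_iter path usually performs its own locking and write checks,
 * then hands the copy-in loop to iomap_file_buffered_write().  The
 * "myfs_" names and myfs_iomap_ops are hypothetical.
 */
#if 0	/* example only */
static ssize_t myfs_buffered_write_iter(struct kiocb *iocb,
		struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &myfs_iomap_ops);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
#endif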
814 
815 static loff_t
816 iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
817 		struct iomap *iomap, struct iomap *srcmap)
818 {
819 	long status = 0;
820 	loff_t written = 0;
821 
822 	/* don't bother with blocks that are not shared to start with */
823 	if (!(iomap->flags & IOMAP_F_SHARED))
824 		return length;
825 	/* don't bother with holes or unwritten extents */
826 	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
827 		return length;
828 
829 	do {
830 		unsigned long offset = offset_in_page(pos);
831 		unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
832 		struct page *page;
833 
834 		status = iomap_write_begin(inode, pos, bytes,
835 				IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
836 		if (unlikely(status))
837 			return status;
838 
839 		status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
840 				srcmap);
841 		if (WARN_ON_ONCE(status == 0))
842 			return -EIO;
843 
844 		cond_resched();
845 
846 		pos += status;
847 		written += status;
848 		length -= status;
849 
850 		balance_dirty_pages_ratelimited(inode->i_mapping);
851 	} while (length);
852 
853 	return written;
854 }
855 
856 int
857 iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
858 		const struct iomap_ops *ops)
859 {
860 	loff_t ret;
861 
862 	while (len) {
863 		ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
864 				iomap_unshare_actor);
865 		if (ret <= 0)
866 			return ret;
867 		pos += ret;
868 		len -= ret;
869 	}
870 
871 	return 0;
872 }
873 EXPORT_SYMBOL_GPL(iomap_file_unshare);
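
/*
 * Illustrative sketch, not part of this file: a COW-capable filesystem
 * might call iomap_file_unshare() from its FALLOC_FL_UNSHARE_RANGE
 * handler and then flush the newly allocated copies.  Names are
 * hypothetical.
 */
#if 0	/* example only */
static int myfs_unshare_range(struct inode *inode, loff_t pos, loff_t len)
{
	int error;

	error = iomap_file_unshare(inode, pos, len, &myfs_iomap_ops);
	if (error)
		return error;
	return filemap_write_and_wait_range(inode->i_mapping, pos,
			pos + len - 1);
}
#endif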
874 
875 static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length,
876 		struct iomap *iomap, struct iomap *srcmap)
877 {
878 	struct page *page;
879 	int status;
880 	unsigned offset = offset_in_page(pos);
881 	unsigned bytes = min_t(u64, PAGE_SIZE - offset, length);
882 
883 	status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
884 	if (status)
885 		return status;
886 
887 	zero_user(page, offset, bytes);
888 	mark_page_accessed(page);
889 
890 	return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
891 }
892 
893 static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos,
894 		loff_t length, void *data, struct iomap *iomap,
895 		struct iomap *srcmap)
896 {
897 	bool *did_zero = data;
898 	loff_t written = 0;
899 
900 	/* already zeroed?  we're done. */
901 	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
902 		return length;
903 
904 	do {
905 		s64 bytes;
906 
907 		if (IS_DAX(inode))
908 			bytes = dax_iomap_zero(pos, length, iomap);
909 		else
910 			bytes = iomap_zero(inode, pos, length, iomap, srcmap);
911 		if (bytes < 0)
912 			return bytes;
913 
914 		pos += bytes;
915 		length -= bytes;
916 		written += bytes;
917 		if (did_zero)
918 			*did_zero = true;
919 	} while (length > 0);
920 
921 	return written;
922 }
923 
924 int
925 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
926 		const struct iomap_ops *ops)
927 {
928 	loff_t ret;
929 
930 	while (len > 0) {
931 		ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
932 				ops, did_zero, iomap_zero_range_actor);
933 		if (ret <= 0)
934 			return ret;
935 
936 		pos += ret;
937 		len -= ret;
938 	}
939 
940 	return 0;
941 }
942 EXPORT_SYMBOL_GPL(iomap_zero_range);
943 
944 int
945 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
946 		const struct iomap_ops *ops)
947 {
948 	unsigned int blocksize = i_blocksize(inode);
949 	unsigned int off = pos & (blocksize - 1);
950 
951 	/* Block boundary? Nothing to do */
952 	if (!off)
953 		return 0;
954 	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
955 }
956 EXPORT_SYMBOL_GPL(iomap_truncate_page);
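
/*
 * Illustrative sketch, not part of this file: when shrinking a file, a
 * filesystem typically zeroes the partial block at the new EOF with
 * iomap_truncate_page() before updating i_size, so no stale data is
 * left in the page cache.  Names are hypothetical.
 */
#if 0	/* example only */
static int myfs_truncate_prepare(struct inode *inode, loff_t newsize)
{
	bool did_zero = false;

	return iomap_truncate_page(inode, newsize, &did_zero,
			&myfs_iomap_ops);
}
#endif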
957 
958 static loff_t
959 iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
960 		void *data, struct iomap *iomap, struct iomap *srcmap)
961 {
962 	struct page *page = data;
963 	int ret;
964 
965 	if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
966 		ret = __block_write_begin_int(page, pos, length, NULL, iomap);
967 		if (ret)
968 			return ret;
969 		block_commit_write(page, 0, length);
970 	} else {
971 		WARN_ON_ONCE(!PageUptodate(page));
972 		set_page_dirty(page);
973 	}
974 
975 	return length;
976 }
977 
978 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
979 {
980 	struct page *page = vmf->page;
981 	struct inode *inode = file_inode(vmf->vma->vm_file);
982 	unsigned long length;
983 	loff_t offset;
984 	ssize_t ret;
985 
986 	lock_page(page);
987 	ret = page_mkwrite_check_truncate(page, inode);
988 	if (ret < 0)
989 		goto out_unlock;
990 	length = ret;
991 
992 	offset = page_offset(page);
993 	while (length > 0) {
994 		ret = iomap_apply(inode, offset, length,
995 				IOMAP_WRITE | IOMAP_FAULT, ops, page,
996 				iomap_page_mkwrite_actor);
997 		if (unlikely(ret <= 0))
998 			goto out_unlock;
999 		offset += ret;
1000 		length -= ret;
1001 	}
1002 
1003 	wait_for_stable_page(page);
1004 	return VM_FAULT_LOCKED;
1005 out_unlock:
1006 	unlock_page(page);
1007 	return block_page_mkwrite_return(ret);
1008 }
1009 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
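
/*
 * Illustrative sketch, not part of this file: a filesystem exposes the
 * helper above through its vm_operations_struct, typically taking any
 * mmap or extension locks it needs around the call.  Names are
 * hypothetical.
 */
#if 0	/* example only */
static vm_fault_t myfs_page_mkwrite(struct vm_fault *vmf)
{
	return iomap_page_mkwrite(vmf, &myfs_iomap_ops);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= myfs_page_mkwrite,
};
#endif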
1010 
1011 static void
1012 iomap_finish_page_writeback(struct inode *inode, struct page *page,
1013 		int error, unsigned int len)
1014 {
1015 	struct iomap_page *iop = to_iomap_page(page);
1016 
1017 	if (error) {
1018 		SetPageError(page);
1019 		mapping_set_error(inode->i_mapping, -EIO);
1020 	}
1021 
1022 	WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop);
1023 	WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0);
1024 
1025 	if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending))
1026 		end_page_writeback(page);
1027 }
1028 
1029 /*
1030  * We're now finished for good with this ioend structure.  Update the page
1031  * state, release holds on bios, and finally free up memory.  Do not use the
1032  * ioend after this.
1033  */
1034 static void
1035 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
1036 {
1037 	struct inode *inode = ioend->io_inode;
1038 	struct bio *bio = &ioend->io_inline_bio;
1039 	struct bio *last = ioend->io_bio, *next;
1040 	u64 start = bio->bi_iter.bi_sector;
1041 	loff_t offset = ioend->io_offset;
1042 	bool quiet = bio_flagged(bio, BIO_QUIET);
1043 
1044 	for (bio = &ioend->io_inline_bio; bio; bio = next) {
1045 		struct bio_vec *bv;
1046 		struct bvec_iter_all iter_all;
1047 
1048 		/*
1049 		 * For the last bio, bi_private points to the ioend, so we
1050 		 * need to explicitly end the iteration here.
1051 		 */
1052 		if (bio == last)
1053 			next = NULL;
1054 		else
1055 			next = bio->bi_private;
1056 
1057 		/* walk each page on bio, ending page IO on them */
1058 		bio_for_each_segment_all(bv, bio, iter_all)
1059 			iomap_finish_page_writeback(inode, bv->bv_page, error,
1060 					bv->bv_len);
1061 		bio_put(bio);
1062 	}
1063 	/* The ioend has been freed by bio_put() */
1064 
1065 	if (unlikely(error && !quiet)) {
1066 		printk_ratelimited(KERN_ERR
1067 "%s: writeback error on inode %lu, offset %lld, sector %llu",
1068 			inode->i_sb->s_id, inode->i_ino, offset, start);
1069 	}
1070 }
1071 
1072 void
1073 iomap_finish_ioends(struct iomap_ioend *ioend, int error)
1074 {
1075 	struct list_head tmp;
1076 
1077 	list_replace_init(&ioend->io_list, &tmp);
1078 	iomap_finish_ioend(ioend, error);
1079 
1080 	while (!list_empty(&tmp)) {
1081 		ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
1082 		list_del_init(&ioend->io_list);
1083 		iomap_finish_ioend(ioend, error);
1084 	}
1085 }
1086 EXPORT_SYMBOL_GPL(iomap_finish_ioends);
1087 
1088 /*
1089  * We can merge two adjacent ioends if they have the same set of work to do.
1090  */
1091 static bool
1092 iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
1093 {
1094 	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
1095 		return false;
1096 	if ((ioend->io_flags & IOMAP_F_SHARED) ^
1097 	    (next->io_flags & IOMAP_F_SHARED))
1098 		return false;
1099 	if ((ioend->io_type == IOMAP_UNWRITTEN) ^
1100 	    (next->io_type == IOMAP_UNWRITTEN))
1101 		return false;
1102 	if (ioend->io_offset + ioend->io_size != next->io_offset)
1103 		return false;
1104 	return true;
1105 }
1106 
1107 void
1108 iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
1109 {
1110 	struct iomap_ioend *next;
1111 
1112 	INIT_LIST_HEAD(&ioend->io_list);
1113 
1114 	while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
1115 			io_list))) {
1116 		if (!iomap_ioend_can_merge(ioend, next))
1117 			break;
1118 		list_move_tail(&next->io_list, &ioend->io_list);
1119 		ioend->io_size += next->io_size;
1120 	}
1121 }
1122 EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
1123 
1124 static int
1125 iomap_ioend_compare(void *priv, const struct list_head *a,
1126 		const struct list_head *b)
1127 {
1128 	struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
1129 	struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
1130 
1131 	if (ia->io_offset < ib->io_offset)
1132 		return -1;
1133 	if (ia->io_offset > ib->io_offset)
1134 		return 1;
1135 	return 0;
1136 }
1137 
1138 void
1139 iomap_sort_ioends(struct list_head *ioend_list)
1140 {
1141 	list_sort(NULL, ioend_list, iomap_ioend_compare);
1142 }
1143 EXPORT_SYMBOL_GPL(iomap_sort_ioends);
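
/*
 * Illustrative sketch, not part of this file: a filesystem that defers
 * ioend completion to a workqueue can batch the queued ioends, sort
 * them, merge adjacent ones and then finish each merged chain.  Names
 * are hypothetical; the completion error is taken from the first bio,
 * matching the status check in iomap_ioend_can_merge() above.
 */
#if 0	/* example only */
static void myfs_finish_ioend_list(struct list_head *completed)
{
	struct iomap_ioend *ioend;

	iomap_sort_ioends(completed);
	while ((ioend = list_first_entry_or_null(completed,
			struct iomap_ioend, io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, completed);
		iomap_finish_ioends(ioend,
				blk_status_to_errno(ioend->io_bio->bi_status));
	}
}
#endif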
1144 
1145 static void iomap_writepage_end_bio(struct bio *bio)
1146 {
1147 	struct iomap_ioend *ioend = bio->bi_private;
1148 
1149 	iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
1150 }
1151 
1152 /*
1153  * Submit the final bio for an ioend.
1154  *
1155  * If @error is non-zero, it means that we have a situation where some part of
1156  * the submission process has failed after we have marked pages for writeback
1157  * and unlocked them.  In this situation, we need to fail the bio instead of
1158  * submitting it.  This typically only happens on a filesystem shutdown.
1159  */
1160 static int
1161 iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
1162 		int error)
1163 {
1164 	ioend->io_bio->bi_private = ioend;
1165 	ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
1166 
1167 	if (wpc->ops->prepare_ioend)
1168 		error = wpc->ops->prepare_ioend(ioend, error);
1169 	if (error) {
1170 		/*
1171 		 * If we are failing the IO now, just mark the ioend with an
1172 		 * error and finish it.  This will run IO completion immediately
1173 		 * as there is only one reference to the ioend at this point in
1174 		 * time.
1175 		 */
1176 		ioend->io_bio->bi_status = errno_to_blk_status(error);
1177 		bio_endio(ioend->io_bio);
1178 		return error;
1179 	}
1180 
1181 	submit_bio(ioend->io_bio);
1182 	return 0;
1183 }
1184 
1185 static struct iomap_ioend *
1186 iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
1187 		loff_t offset, sector_t sector, struct writeback_control *wbc)
1188 {
1189 	struct iomap_ioend *ioend;
1190 	struct bio *bio;
1191 
1192 	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &iomap_ioend_bioset);
1193 	bio_set_dev(bio, wpc->iomap.bdev);
1194 	bio->bi_iter.bi_sector = sector;
1195 	bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
1196 	bio->bi_write_hint = inode->i_write_hint;
1197 	wbc_init_bio(wbc, bio);
1198 
1199 	ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
1200 	INIT_LIST_HEAD(&ioend->io_list);
1201 	ioend->io_type = wpc->iomap.type;
1202 	ioend->io_flags = wpc->iomap.flags;
1203 	ioend->io_inode = inode;
1204 	ioend->io_size = 0;
1205 	ioend->io_offset = offset;
1206 	ioend->io_bio = bio;
1207 	return ioend;
1208 }
1209 
1210 /*
1211  * Allocate a new bio, and chain the old bio to the new one.
1212  *
1213  * Note that we have to perform the chaining in this unintuitive order
1214  * so that the bi_private linkage is set up in the right direction for the
1215  * traversal in iomap_finish_ioend().
1216  */
1217 static struct bio *
1218 iomap_chain_bio(struct bio *prev)
1219 {
1220 	struct bio *new;
1221 
1222 	new = bio_alloc(GFP_NOFS, BIO_MAX_VECS);
1223 	bio_copy_dev(new, prev);/* also copies over blkcg information */
1224 	new->bi_iter.bi_sector = bio_end_sector(prev);
1225 	new->bi_opf = prev->bi_opf;
1226 	new->bi_write_hint = prev->bi_write_hint;
1227 
1228 	bio_chain(prev, new);
1229 	bio_get(prev);		/* for iomap_finish_ioend */
1230 	submit_bio(prev);
1231 	return new;
1232 }
1233 
1234 static bool
1235 iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
1236 		sector_t sector)
1237 {
1238 	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
1239 	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
1240 		return false;
1241 	if (wpc->iomap.type != wpc->ioend->io_type)
1242 		return false;
1243 	if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
1244 		return false;
1245 	if (sector != bio_end_sector(wpc->ioend->io_bio))
1246 		return false;
1247 	return true;
1248 }
1249 
1250 /*
1251  * Test to see if we have an existing ioend structure that we could append to
1252  * first, otherwise finish off the current ioend and start another.
1253  */
1254 static void
1255 iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
1256 		struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
1257 		struct writeback_control *wbc, struct list_head *iolist)
1258 {
1259 	sector_t sector = iomap_sector(&wpc->iomap, offset);
1260 	unsigned len = i_blocksize(inode);
1261 	unsigned poff = offset & (PAGE_SIZE - 1);
1262 	bool merged, same_page = false;
1263 
1264 	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
1265 		if (wpc->ioend)
1266 			list_add(&wpc->ioend->io_list, iolist);
1267 		wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
1268 	}
1269 
1270 	merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
1271 			&same_page);
1272 	if (iop)
1273 		atomic_add(len, &iop->write_bytes_pending);
1274 
1275 	if (!merged) {
1276 		if (bio_full(wpc->ioend->io_bio, len)) {
1277 			wpc->ioend->io_bio =
1278 				iomap_chain_bio(wpc->ioend->io_bio);
1279 		}
1280 		bio_add_page(wpc->ioend->io_bio, page, len, poff);
1281 	}
1282 
1283 	wpc->ioend->io_size += len;
1284 	wbc_account_cgroup_owner(wbc, page, len);
1285 }
1286 
1287 /*
1288  * We implement an immediate ioend submission policy here to avoid needing to
1289  * chain multiple ioends and hence nest mempool allocations which can violate
1290  * forward progress guarantees we need to provide. The current ioend we are
1291  * adding blocks to is cached on the writepage context, and if the new block
1292  * does not append to the cached ioend, it will create a new ioend and cache that
1293  * instead.
1294  *
1295  * If a new ioend is created and cached, the old ioend is returned and queued
1296  * locally for submission once the entire page is processed or an error has been
1297  * detected.  While ioends are submitted immediately after they are completed,
1298  * batching optimisations are provided by higher level block plugging.
1299  *
1300  * At the end of a writeback pass, there will be a cached ioend remaining on the
1301  * writepage context that the caller will need to submit.
1302  */
1303 static int
1304 iomap_writepage_map(struct iomap_writepage_ctx *wpc,
1305 		struct writeback_control *wbc, struct inode *inode,
1306 		struct page *page, u64 end_offset)
1307 {
1308 	struct iomap_page *iop = iomap_page_create(inode, page);
1309 	struct iomap_ioend *ioend, *next;
1310 	unsigned len = i_blocksize(inode);
1311 	u64 file_offset; /* file offset of page */
1312 	int error = 0, count = 0, i;
1313 	LIST_HEAD(submit_list);
1314 
1315 	WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0);
1316 
1317 	/*
1318 	 * Walk through the page to find areas to write back. If we run off the
1319 	 * end of the current map or find the current map invalid, grab a new
1320 	 * one.
1321 	 */
1322 	for (i = 0, file_offset = page_offset(page);
1323 	     i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
1324 	     i++, file_offset += len) {
1325 		if (iop && !test_bit(i, iop->uptodate))
1326 			continue;
1327 
1328 		error = wpc->ops->map_blocks(wpc, inode, file_offset);
1329 		if (error)
1330 			break;
1331 		if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
1332 			continue;
1333 		if (wpc->iomap.type == IOMAP_HOLE)
1334 			continue;
1335 		iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
1336 				 &submit_list);
1337 		count++;
1338 	}
1339 
1340 	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
1341 	WARN_ON_ONCE(!PageLocked(page));
1342 	WARN_ON_ONCE(PageWriteback(page));
1343 	WARN_ON_ONCE(PageDirty(page));
1344 
1345 	/*
1346 	 * We cannot cancel the ioend directly here on error.  We may have
1347 	 * already set other pages under writeback and hence we have to run I/O
1348 	 * completion to mark the error state of the pages under writeback
1349 	 * appropriately.
1350 	 */
1351 	if (unlikely(error)) {
1352 		/*
1353 		 * Let the filesystem know what portion of the current page
1354 		 * failed to map. If the page hasn't been added to the ioend, it
1355 		 * won't be affected by I/O completion and we must unlock it
1356 		 * now.
1357 		 */
1358 		if (wpc->ops->discard_page)
1359 			wpc->ops->discard_page(page, file_offset);
1360 		if (!count) {
1361 			ClearPageUptodate(page);
1362 			unlock_page(page);
1363 			goto done;
1364 		}
1365 	}
1366 
1367 	set_page_writeback(page);
1368 	unlock_page(page);
1369 
1370 	/*
1371 	 * Preserve the original error if there was one, otherwise catch
1372 	 * submission errors here and propagate into subsequent ioend
1373 	 * submissions.
1374 	 */
1375 	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1376 		int error2;
1377 
1378 		list_del_init(&ioend->io_list);
1379 		error2 = iomap_submit_ioend(wpc, ioend, error);
1380 		if (error2 && !error)
1381 			error = error2;
1382 	}
1383 
1384 	/*
1385 	 * We can end up here with no error and nothing to write only if we race
1386 	 * with a partial page truncate on a sub-page block sized filesystem.
1387 	 */
1388 	if (!count)
1389 		end_page_writeback(page);
1390 done:
1391 	mapping_set_error(page->mapping, error);
1392 	return error;
1393 }
1394 
1395 /*
1396  * Write out a dirty page.
1397  *
1398  * For delalloc space on the page we need to allocate space and flush it.
1399  * For unwritten space on the page we need to start the conversion to
1400  * regular allocated space.
1401  */
1402 static int
1403 iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
1404 {
1405 	struct iomap_writepage_ctx *wpc = data;
1406 	struct inode *inode = page->mapping->host;
1407 	pgoff_t end_index;
1408 	u64 end_offset;
1409 	loff_t offset;
1410 
1411 	trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE);
1412 
1413 	/*
1414 	 * Refuse to write the page out if we are called from reclaim context.
1415 	 *
1416 	 * This avoids stack overflows when called from deeply used stacks in
1417 	 * random callers for direct reclaim or memcg reclaim.  We explicitly
1418 	 * allow reclaim from kswapd as the stack usage there is relatively low.
1419 	 *
1420 	 * This should never happen except in the case of a VM regression so
1421 	 * warn about it.
1422 	 */
1423 	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1424 			PF_MEMALLOC))
1425 		goto redirty;
1426 
1427 	/*
1428 	 * Is this page beyond the end of the file?
1429 	 *
1430 	 * The page index is less than the end_index, adjust the end_offset
1431 	 * to the highest offset that this page should represent.
1432 	 * -----------------------------------------------------
1433 	 * |			file mapping	       | <EOF> |
1434 	 * -----------------------------------------------------
1435 	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1436 	 * ^--------------------------------^----------|--------
1437 	 * |     desired writeback range    |      see else    |
1438 	 * ---------------------------------^------------------|
1439 	 */
1440 	offset = i_size_read(inode);
1441 	end_index = offset >> PAGE_SHIFT;
1442 	if (page->index < end_index)
1443 		end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
1444 	else {
1445 		/*
1446 		 * Check whether the page to write out is beyond or straddles
1447 		 * i_size or not.
1448 		 * -------------------------------------------------------
1449 		 * |		file mapping		        | <EOF>  |
1450 		 * -------------------------------------------------------
1451 		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1452 		 * ^--------------------------------^-----------|---------
1453 		 * |				    |      Straddles     |
1454 		 * ---------------------------------^-----------|--------|
1455 		 */
1456 		unsigned offset_into_page = offset & (PAGE_SIZE - 1);
1457 
1458 		/*
1459 		 * Skip the page if it is fully outside i_size, e.g. due to a
1460 		 * truncate operation that is in progress. We must redirty the
1461 		 * page so that reclaim stops reclaiming it. Otherwise
1462 		 * iomap_releasepage() is called on it and gets confused.
1463 		 *
1464 		 * Note that end_index is an unsigned long; it would overflow
1465 		 * if the given offset is greater than 16TB on a 32-bit system,
1466 		 * and if we checked whether the page is fully outside i_size
1467 		 * via "if (page->index >= end_index + 1)", "end_index + 1"
1468 		 * would evaluate to 0.  The page would then be redirtied
1469 		 * and written out repeatedly, which would result in an
1470 		 * infinite loop; the user program performing this operation
1471 		 * would hang.  Instead, we verify this situation by checking
1472 		 * whether the page to write is totally beyond i_size or
1473 		 * whether its offset is just equal to the EOF.
1474 		 */
1475 		if (page->index > end_index ||
1476 		    (page->index == end_index && offset_into_page == 0))
1477 			goto redirty;
1478 
1479 		/*
1480 		 * The page straddles i_size.  It must be zeroed out on each
1481 		 * and every writepage invocation because it may be mmapped.
1482 		 * "A file is mapped in multiples of the page size.  For a file
1483 		 * that is not a multiple of the page size, the remaining
1484 		 * memory is zeroed when mapped, and writes to that region are
1485 		 * not written out to the file."
1486 		 */
1487 		zero_user_segment(page, offset_into_page, PAGE_SIZE);
1488 
1489 		/* Adjust the end_offset to the end of file */
1490 		end_offset = offset;
1491 	}
1492 
1493 	return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
1494 
1495 redirty:
1496 	redirty_page_for_writepage(wbc, page);
1497 	unlock_page(page);
1498 	return 0;
1499 }
1500 
1501 int
1502 iomap_writepage(struct page *page, struct writeback_control *wbc,
1503 		struct iomap_writepage_ctx *wpc,
1504 		const struct iomap_writeback_ops *ops)
1505 {
1506 	int ret;
1507 
1508 	wpc->ops = ops;
1509 	ret = iomap_do_writepage(page, wbc, wpc);
1510 	if (!wpc->ioend)
1511 		return ret;
1512 	return iomap_submit_ioend(wpc, wpc->ioend, ret);
1513 }
1514 EXPORT_SYMBOL_GPL(iomap_writepage);
1515 
1516 int
1517 iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
1518 		struct iomap_writepage_ctx *wpc,
1519 		const struct iomap_writeback_ops *ops)
1520 {
1521 	int			ret;
1522 
1523 	wpc->ops = ops;
1524 	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
1525 	if (!wpc->ioend)
1526 		return ret;
1527 	return iomap_submit_ioend(wpc, wpc->ioend, ret);
1528 }
1529 EXPORT_SYMBOL_GPL(iomap_writepages);
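
/*
 * Illustrative sketch, not part of this file: to use the writeback path
 * above, a filesystem embeds struct iomap_writepage_ctx in its own
 * context, supplies at least a ->map_blocks callback that fills
 * wpc->iomap for the block at @offset, and calls iomap_writepage() /
 * iomap_writepages() from its address_space operations.  All "myfs_"
 * names are hypothetical.
 */
#if 0	/* example only */
struct myfs_writepage_ctx {
	struct iomap_writepage_ctx	ctx;
	/* filesystem-private state, e.g. a cached extent lookup */
};

static int myfs_map_blocks(struct iomap_writepage_ctx *wpc,
		struct inode *inode, loff_t offset)
{
	/* look up or allocate the extent covering @offset into wpc->iomap */
	return 0;
}

static const struct iomap_writeback_ops myfs_writeback_ops = {
	.map_blocks	= myfs_map_blocks,
};

static int myfs_writepages(struct address_space *mapping,
		struct writeback_control *wbc)
{
	struct myfs_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc.ctx, &myfs_writeback_ops);
}
#endif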
1530 
1531 static int __init iomap_init(void)
1532 {
1533 	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
1534 			   offsetof(struct iomap_ioend, io_inline_bio),
1535 			   BIOSET_NEED_BVECS);
1536 }
1537 fs_initcall(iomap_init);
1538