xref: /openbmc/linux/fs/xfs/xfs_aops.c (revision 981ab3f1)
1 /*
2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_shared.h"
20 #include "xfs_format.h"
21 #include "xfs_log_format.h"
22 #include "xfs_trans_resv.h"
23 #include "xfs_mount.h"
24 #include "xfs_inode.h"
25 #include "xfs_trans.h"
26 #include "xfs_inode_item.h"
27 #include "xfs_alloc.h"
28 #include "xfs_error.h"
29 #include "xfs_iomap.h"
30 #include "xfs_trace.h"
31 #include "xfs_bmap.h"
32 #include "xfs_bmap_util.h"
33 #include "xfs_bmap_btree.h"
34 #include "xfs_reflink.h"
35 #include <linux/gfp.h>
36 #include <linux/mpage.h>
37 #include <linux/pagevec.h>
38 #include <linux/writeback.h>
39 
40 /*
41  * structure owned by writepages passed to individual writepage calls
42  */
43 struct xfs_writepage_ctx {
44 	struct xfs_bmbt_irec    imap;
45 	bool			imap_valid;
46 	unsigned int		io_type;
47 	struct xfs_ioend	*ioend;
48 	sector_t		last_block;
49 };
50 
51 void
52 xfs_count_page_state(
53 	struct page		*page,
54 	int			*delalloc,
55 	int			*unwritten)
56 {
57 	struct buffer_head	*bh, *head;
58 
59 	*delalloc = *unwritten = 0;
60 
61 	bh = head = page_buffers(page);
62 	do {
63 		if (buffer_unwritten(bh))
64 			(*unwritten) = 1;
65 		else if (buffer_delay(bh))
66 			(*delalloc) = 1;
67 	} while ((bh = bh->b_this_page) != head);
68 }
69 
70 struct block_device *
71 xfs_find_bdev_for_inode(
72 	struct inode		*inode)
73 {
74 	struct xfs_inode	*ip = XFS_I(inode);
75 	struct xfs_mount	*mp = ip->i_mount;
76 
77 	if (XFS_IS_REALTIME_INODE(ip))
78 		return mp->m_rtdev_targp->bt_bdev;
79 	else
80 		return mp->m_ddev_targp->bt_bdev;
81 }
82 
83 /*
84  * We're now finished for good with this page.  Update the page state via the
85  * associated buffer_heads, paying attention to the start and end offsets that
86  * we need to process on the page.
87  *
88  * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
89  * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
90  * the page at all, as we may be racing with memory reclaim and it can free both
91  * the bufferhead chain and the page as it will see the page as clean and
92  * unused.
93  */
94 static void
95 xfs_finish_page_writeback(
96 	struct inode		*inode,
97 	struct bio_vec		*bvec,
98 	int			error)
99 {
100 	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
101 	struct buffer_head	*head, *bh, *next;
102 	unsigned int		off = 0;
103 	unsigned int		bsize;
104 
105 	ASSERT(bvec->bv_offset < PAGE_SIZE);
106 	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
107 	ASSERT(end < PAGE_SIZE);
108 	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
109 
110 	bh = head = page_buffers(bvec->bv_page);
111 
112 	bsize = bh->b_size;
113 	do {
114 		if (off > end)
115 			break;
116 		next = bh->b_this_page;
117 		if (off < bvec->bv_offset)
118 			goto next_bh;
119 		bh->b_end_io(bh, !error);
120 next_bh:
121 		off += bsize;
122 	} while ((bh = next) != head);
123 }
124 
125 /*
126  * We're now finished for good with this ioend structure.  Update the page
127  * state, release holds on bios, and finally free up memory.  Do not use the
128  * ioend after this.
129  */
130 STATIC void
131 xfs_destroy_ioend(
132 	struct xfs_ioend	*ioend,
133 	int			error)
134 {
135 	struct inode		*inode = ioend->io_inode;
136 	struct bio		*last = ioend->io_bio;
137 	struct bio		*bio, *next;
138 
139 	for (bio = &ioend->io_inline_bio; bio; bio = next) {
140 		struct bio_vec	*bvec;
141 		int		i;
142 
143 		/*
144 		 * For the last bio, bi_private points to the ioend, so we
145 		 * need to explicitly end the iteration here.
146 		 */
147 		if (bio == last)
148 			next = NULL;
149 		else
150 			next = bio->bi_private;
151 
152 		/* walk each page on bio, ending page IO on them */
153 		bio_for_each_segment_all(bvec, bio, i)
154 			xfs_finish_page_writeback(inode, bvec, error);
155 
156 		bio_put(bio);
157 	}
158 }
159 
160 /*
161  * Fast and loose check if this write could update the on-disk inode size.
162  */
163 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
164 {
165 	return ioend->io_offset + ioend->io_size >
166 		XFS_I(ioend->io_inode)->i_d.di_size;
167 }
168 
169 STATIC int
170 xfs_setfilesize_trans_alloc(
171 	struct xfs_ioend	*ioend)
172 {
173 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
174 	struct xfs_trans	*tp;
175 	int			error;
176 
177 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
178 	if (error)
179 		return error;
180 
181 	ioend->io_append_trans = tp;
182 
183 	/*
184 	 * We may pass freeze protection with a transaction.  So tell lockdep
185 	 * we released it.
186 	 */
187 	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
188 	/*
189 	 * We hand off the transaction to the completion thread now, so
190 	 * clear the flag here.
191 	 */
192 	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
193 	return 0;
194 }
195 
196 /*
197  * Update on-disk file size now that data has been written to disk.
198  */
199 STATIC int
200 __xfs_setfilesize(
201 	struct xfs_inode	*ip,
202 	struct xfs_trans	*tp,
203 	xfs_off_t		offset,
204 	size_t			size)
205 {
206 	xfs_fsize_t		isize;
207 
208 	xfs_ilock(ip, XFS_ILOCK_EXCL);
209 	isize = xfs_new_eof(ip, offset + size);
210 	if (!isize) {
211 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
212 		xfs_trans_cancel(tp);
213 		return 0;
214 	}
215 
216 	trace_xfs_setfilesize(ip, offset, size);
217 
218 	ip->i_d.di_size = isize;
219 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
220 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
221 
222 	return xfs_trans_commit(tp);
223 }
224 
225 int
226 xfs_setfilesize(
227 	struct xfs_inode	*ip,
228 	xfs_off_t		offset,
229 	size_t			size)
230 {
231 	struct xfs_mount	*mp = ip->i_mount;
232 	struct xfs_trans	*tp;
233 	int			error;
234 
235 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
236 	if (error)
237 		return error;
238 
239 	return __xfs_setfilesize(ip, tp, offset, size);
240 }
241 
242 STATIC int
243 xfs_setfilesize_ioend(
244 	struct xfs_ioend	*ioend,
245 	int			error)
246 {
247 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
248 	struct xfs_trans	*tp = ioend->io_append_trans;
249 
250 	/*
251 	 * The transaction may have been allocated in the I/O submission thread,
252 	 * thus we need to mark ourselves as being in a transaction manually.
253 	 * Similarly for freeze protection.
254 	 */
255 	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
256 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
257 
258 	/* we abort the update if there was an IO error */
259 	if (error) {
260 		xfs_trans_cancel(tp);
261 		return error;
262 	}
263 
264 	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
265 }
266 
267 /*
268  * IO write completion.
269  */
270 STATIC void
271 xfs_end_io(
272 	struct work_struct *work)
273 {
274 	struct xfs_ioend	*ioend =
275 		container_of(work, struct xfs_ioend, io_work);
276 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
277 	xfs_off_t		offset = ioend->io_offset;
278 	size_t			size = ioend->io_size;
279 	int			error;
280 
281 	/*
282 	 * Just clean up the in-memory strutures if the fs has been shut down.
283 	 */
284 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
285 		error = -EIO;
286 		goto done;
287 	}
288 
289 	/*
290 	 * Clean up any COW blocks on an I/O error.
291 	 */
292 	error = blk_status_to_errno(ioend->io_bio->bi_status);
293 	if (unlikely(error)) {
294 		switch (ioend->io_type) {
295 		case XFS_IO_COW:
296 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
297 			break;
298 		}
299 
300 		goto done;
301 	}
302 
303 	/*
304 	 * Success:  commit the COW or unwritten blocks if needed.
305 	 */
306 	switch (ioend->io_type) {
307 	case XFS_IO_COW:
308 		error = xfs_reflink_end_cow(ip, offset, size);
309 		break;
310 	case XFS_IO_UNWRITTEN:
311 		error = xfs_iomap_write_unwritten(ip, offset, size);
312 		break;
313 	default:
314 		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
315 		break;
316 	}
317 
318 done:
319 	if (ioend->io_append_trans)
320 		error = xfs_setfilesize_ioend(ioend, error);
321 	xfs_destroy_ioend(ioend, error);
322 }
323 
324 STATIC void
325 xfs_end_bio(
326 	struct bio		*bio)
327 {
328 	struct xfs_ioend	*ioend = bio->bi_private;
329 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
330 
331 	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
332 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
333 	else if (ioend->io_append_trans)
334 		queue_work(mp->m_data_workqueue, &ioend->io_work);
335 	else
336 		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
337 }
338 
339 STATIC int
340 xfs_map_blocks(
341 	struct inode		*inode,
342 	loff_t			offset,
343 	struct xfs_bmbt_irec	*imap,
344 	int			type)
345 {
346 	struct xfs_inode	*ip = XFS_I(inode);
347 	struct xfs_mount	*mp = ip->i_mount;
348 	ssize_t			count = i_blocksize(inode);
349 	xfs_fileoff_t		offset_fsb, end_fsb;
350 	int			error = 0;
351 	int			bmapi_flags = XFS_BMAPI_ENTIRE;
352 	int			nimaps = 1;
353 
354 	if (XFS_FORCED_SHUTDOWN(mp))
355 		return -EIO;
356 
357 	ASSERT(type != XFS_IO_COW);
358 	if (type == XFS_IO_UNWRITTEN)
359 		bmapi_flags |= XFS_BMAPI_IGSTATE;
360 
361 	xfs_ilock(ip, XFS_ILOCK_SHARED);
362 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
363 	       (ip->i_df.if_flags & XFS_IFEXTENTS));
364 	ASSERT(offset <= mp->m_super->s_maxbytes);
365 
366 	if (offset + count > mp->m_super->s_maxbytes)
367 		count = mp->m_super->s_maxbytes - offset;
368 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
369 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
370 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
371 				imap, &nimaps, bmapi_flags);
372 	/*
373 	 * Truncate an overwrite extent if there's a pending CoW
374 	 * reservation before the end of this extent.  This forces us
375 	 * to come back to writepage to take care of the CoW.
376 	 */
377 	if (nimaps && type == XFS_IO_OVERWRITE)
378 		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
379 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
380 
381 	if (error)
382 		return error;
383 
384 	if (type == XFS_IO_DELALLOC &&
385 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
386 		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
387 				imap);
388 		if (!error)
389 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
390 		return error;
391 	}
392 
393 #ifdef DEBUG
394 	if (type == XFS_IO_UNWRITTEN) {
395 		ASSERT(nimaps);
396 		ASSERT(imap->br_startblock != HOLESTARTBLOCK);
397 		ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
398 	}
399 #endif
400 	if (nimaps)
401 		trace_xfs_map_blocks_found(ip, offset, count, type, imap);
402 	return 0;
403 }
404 
405 STATIC bool
406 xfs_imap_valid(
407 	struct inode		*inode,
408 	struct xfs_bmbt_irec	*imap,
409 	xfs_off_t		offset)
410 {
411 	offset >>= inode->i_blkbits;
412 
413 	return offset >= imap->br_startoff &&
414 		offset < imap->br_startoff + imap->br_blockcount;
415 }
416 
417 STATIC void
418 xfs_start_buffer_writeback(
419 	struct buffer_head	*bh)
420 {
421 	ASSERT(buffer_mapped(bh));
422 	ASSERT(buffer_locked(bh));
423 	ASSERT(!buffer_delay(bh));
424 	ASSERT(!buffer_unwritten(bh));
425 
426 	mark_buffer_async_write(bh);
427 	set_buffer_uptodate(bh);
428 	clear_buffer_dirty(bh);
429 }
430 
431 STATIC void
432 xfs_start_page_writeback(
433 	struct page		*page,
434 	int			clear_dirty)
435 {
436 	ASSERT(PageLocked(page));
437 	ASSERT(!PageWriteback(page));
438 
439 	/*
440 	 * if the page was not fully cleaned, we need to ensure that the higher
441 	 * layers come back to it correctly. That means we need to keep the page
442 	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
443 	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
444 	 * write this page in this writeback sweep will be made.
445 	 */
446 	if (clear_dirty) {
447 		clear_page_dirty_for_io(page);
448 		set_page_writeback(page);
449 	} else
450 		set_page_writeback_keepwrite(page);
451 
452 	unlock_page(page);
453 }
454 
455 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
456 {
457 	return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
458 }
459 
460 /*
461  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
462  * it, and we submit that bio. The ioend may be used for multiple bio
463  * submissions, so we only want to allocate an append transaction for the ioend
464  * once. In the case of multiple bio submission, each bio will take an IO
465  * reference to the ioend to ensure that the ioend completion is only done once
466  * all bios have been submitted and the ioend is really done.
467  *
468  * If @fail is non-zero, it means that we have a situation where some part of
469  * the submission process has failed after we have marked paged for writeback
470  * and unlocked them. In this situation, we need to fail the bio and ioend
471  * rather than submit it to IO. This typically only happens on a filesystem
472  * shutdown.
473  */
474 STATIC int
475 xfs_submit_ioend(
476 	struct writeback_control *wbc,
477 	struct xfs_ioend	*ioend,
478 	int			status)
479 {
480 	/* Convert CoW extents to regular */
481 	if (!status && ioend->io_type == XFS_IO_COW) {
482 		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
483 				ioend->io_offset, ioend->io_size);
484 	}
485 
486 	/* Reserve log space if we might write beyond the on-disk inode size. */
487 	if (!status &&
488 	    ioend->io_type != XFS_IO_UNWRITTEN &&
489 	    xfs_ioend_is_append(ioend) &&
490 	    !ioend->io_append_trans)
491 		status = xfs_setfilesize_trans_alloc(ioend);
492 
493 	ioend->io_bio->bi_private = ioend;
494 	ioend->io_bio->bi_end_io = xfs_end_bio;
495 	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
496 
497 	/*
498 	 * If we are failing the IO now, just mark the ioend with an
499 	 * error and finish it. This will run IO completion immediately
500 	 * as there is only one reference to the ioend at this point in
501 	 * time.
502 	 */
503 	if (status) {
504 		ioend->io_bio->bi_status = errno_to_blk_status(status);
505 		bio_endio(ioend->io_bio);
506 		return status;
507 	}
508 
509 	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
510 	submit_bio(ioend->io_bio);
511 	return 0;
512 }
513 
514 static void
515 xfs_init_bio_from_bh(
516 	struct bio		*bio,
517 	struct buffer_head	*bh)
518 {
519 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
520 	bio->bi_bdev = bh->b_bdev;
521 }
522 
523 static struct xfs_ioend *
524 xfs_alloc_ioend(
525 	struct inode		*inode,
526 	unsigned int		type,
527 	xfs_off_t		offset,
528 	struct buffer_head	*bh)
529 {
530 	struct xfs_ioend	*ioend;
531 	struct bio		*bio;
532 
533 	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
534 	xfs_init_bio_from_bh(bio, bh);
535 
536 	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
537 	INIT_LIST_HEAD(&ioend->io_list);
538 	ioend->io_type = type;
539 	ioend->io_inode = inode;
540 	ioend->io_size = 0;
541 	ioend->io_offset = offset;
542 	INIT_WORK(&ioend->io_work, xfs_end_io);
543 	ioend->io_append_trans = NULL;
544 	ioend->io_bio = bio;
545 	return ioend;
546 }
547 
548 /*
549  * Allocate a new bio, and chain the old bio to the new one.
550  *
551  * Note that we have to do perform the chaining in this unintuitive order
552  * so that the bi_private linkage is set up in the right direction for the
553  * traversal in xfs_destroy_ioend().
554  */
555 static void
556 xfs_chain_bio(
557 	struct xfs_ioend	*ioend,
558 	struct writeback_control *wbc,
559 	struct buffer_head	*bh)
560 {
561 	struct bio *new;
562 
563 	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
564 	xfs_init_bio_from_bh(new, bh);
565 
566 	bio_chain(ioend->io_bio, new);
567 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
568 	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
569 	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
570 	submit_bio(ioend->io_bio);
571 	ioend->io_bio = new;
572 }
573 
574 /*
575  * Test to see if we've been building up a completion structure for
576  * earlier buffers -- if so, we try to append to this ioend if we
577  * can, otherwise we finish off any current ioend and start another.
578  * Return the ioend we finished off so that the caller can submit it
579  * once it has finished processing the dirty page.
580  */
581 STATIC void
582 xfs_add_to_ioend(
583 	struct inode		*inode,
584 	struct buffer_head	*bh,
585 	xfs_off_t		offset,
586 	struct xfs_writepage_ctx *wpc,
587 	struct writeback_control *wbc,
588 	struct list_head	*iolist)
589 {
590 	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
591 	    bh->b_blocknr != wpc->last_block + 1 ||
592 	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
593 		if (wpc->ioend)
594 			list_add(&wpc->ioend->io_list, iolist);
595 		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
596 	}
597 
598 	/*
599 	 * If the buffer doesn't fit into the bio we need to allocate a new
600 	 * one.  This shouldn't happen more than once for a given buffer.
601 	 */
602 	while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
603 		xfs_chain_bio(wpc->ioend, wbc, bh);
604 
605 	wpc->ioend->io_size += bh->b_size;
606 	wpc->last_block = bh->b_blocknr;
607 	xfs_start_buffer_writeback(bh);
608 }
609 
610 STATIC void
611 xfs_map_buffer(
612 	struct inode		*inode,
613 	struct buffer_head	*bh,
614 	struct xfs_bmbt_irec	*imap,
615 	xfs_off_t		offset)
616 {
617 	sector_t		bn;
618 	struct xfs_mount	*m = XFS_I(inode)->i_mount;
619 	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
620 	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
621 
622 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
623 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
624 
625 	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
626 	      ((offset - iomap_offset) >> inode->i_blkbits);
627 
628 	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
629 
630 	bh->b_blocknr = bn;
631 	set_buffer_mapped(bh);
632 }
633 
634 STATIC void
635 xfs_map_at_offset(
636 	struct inode		*inode,
637 	struct buffer_head	*bh,
638 	struct xfs_bmbt_irec	*imap,
639 	xfs_off_t		offset)
640 {
641 	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
642 	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
643 
644 	xfs_map_buffer(inode, bh, imap, offset);
645 	set_buffer_mapped(bh);
646 	clear_buffer_delay(bh);
647 	clear_buffer_unwritten(bh);
648 }
649 
650 /*
651  * Test if a given page contains at least one buffer of a given @type.
652  * If @check_all_buffers is true, then we walk all the buffers in the page to
653  * try to find one of the type passed in. If it is not set, then the caller only
654  * needs to check the first buffer on the page for a match.
655  */
656 STATIC bool
657 xfs_check_page_type(
658 	struct page		*page,
659 	unsigned int		type,
660 	bool			check_all_buffers)
661 {
662 	struct buffer_head	*bh;
663 	struct buffer_head	*head;
664 
665 	if (PageWriteback(page))
666 		return false;
667 	if (!page->mapping)
668 		return false;
669 	if (!page_has_buffers(page))
670 		return false;
671 
672 	bh = head = page_buffers(page);
673 	do {
674 		if (buffer_unwritten(bh)) {
675 			if (type == XFS_IO_UNWRITTEN)
676 				return true;
677 		} else if (buffer_delay(bh)) {
678 			if (type == XFS_IO_DELALLOC)
679 				return true;
680 		} else if (buffer_dirty(bh) && buffer_mapped(bh)) {
681 			if (type == XFS_IO_OVERWRITE)
682 				return true;
683 		}
684 
685 		/* If we are only checking the first buffer, we are done now. */
686 		if (!check_all_buffers)
687 			break;
688 	} while ((bh = bh->b_this_page) != head);
689 
690 	return false;
691 }
692 
693 STATIC void
694 xfs_vm_invalidatepage(
695 	struct page		*page,
696 	unsigned int		offset,
697 	unsigned int		length)
698 {
699 	trace_xfs_invalidatepage(page->mapping->host, page, offset,
700 				 length);
701 	block_invalidatepage(page, offset, length);
702 }
703 
704 /*
705  * If the page has delalloc buffers on it, we need to punch them out before we
706  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
707  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
708  * is done on that same region - the delalloc extent is returned when none is
709  * supposed to be there.
710  *
711  * We prevent this by truncating away the delalloc regions on the page before
712  * invalidating it. Because they are delalloc, we can do this without needing a
713  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
714  * truncation without a transaction as there is no space left for block
715  * reservation (typically why we see a ENOSPC in writeback).
716  *
717  * This is not a performance critical path, so for now just do the punching a
718  * buffer head at a time.
719  */
720 STATIC void
721 xfs_aops_discard_page(
722 	struct page		*page)
723 {
724 	struct inode		*inode = page->mapping->host;
725 	struct xfs_inode	*ip = XFS_I(inode);
726 	struct buffer_head	*bh, *head;
727 	loff_t			offset = page_offset(page);
728 
729 	if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
730 		goto out_invalidate;
731 
732 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
733 		goto out_invalidate;
734 
735 	xfs_alert(ip->i_mount,
736 		"page discard on page %p, inode 0x%llx, offset %llu.",
737 			page, ip->i_ino, offset);
738 
739 	xfs_ilock(ip, XFS_ILOCK_EXCL);
740 	bh = head = page_buffers(page);
741 	do {
742 		int		error;
743 		xfs_fileoff_t	start_fsb;
744 
745 		if (!buffer_delay(bh))
746 			goto next_buffer;
747 
748 		start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
749 		error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
750 		if (error) {
751 			/* something screwed, just bail */
752 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
753 				xfs_alert(ip->i_mount,
754 			"page discard unable to remove delalloc mapping.");
755 			}
756 			break;
757 		}
758 next_buffer:
759 		offset += i_blocksize(inode);
760 
761 	} while ((bh = bh->b_this_page) != head);
762 
763 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
764 out_invalidate:
765 	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
766 	return;
767 }
768 
769 static int
770 xfs_map_cow(
771 	struct xfs_writepage_ctx *wpc,
772 	struct inode		*inode,
773 	loff_t			offset,
774 	unsigned int		*new_type)
775 {
776 	struct xfs_inode	*ip = XFS_I(inode);
777 	struct xfs_bmbt_irec	imap;
778 	bool			is_cow = false;
779 	int			error;
780 
781 	/*
782 	 * If we already have a valid COW mapping keep using it.
783 	 */
784 	if (wpc->io_type == XFS_IO_COW) {
785 		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
786 		if (wpc->imap_valid) {
787 			*new_type = XFS_IO_COW;
788 			return 0;
789 		}
790 	}
791 
792 	/*
793 	 * Else we need to check if there is a COW mapping at this offset.
794 	 */
795 	xfs_ilock(ip, XFS_ILOCK_SHARED);
796 	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap);
797 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
798 
799 	if (!is_cow)
800 		return 0;
801 
802 	/*
803 	 * And if the COW mapping has a delayed extent here we need to
804 	 * allocate real space for it now.
805 	 */
806 	if (isnullstartblock(imap.br_startblock)) {
807 		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
808 				&imap);
809 		if (error)
810 			return error;
811 	}
812 
813 	wpc->io_type = *new_type = XFS_IO_COW;
814 	wpc->imap_valid = true;
815 	wpc->imap = imap;
816 	return 0;
817 }
818 
819 /*
820  * We implement an immediate ioend submission policy here to avoid needing to
821  * chain multiple ioends and hence nest mempool allocations which can violate
822  * forward progress guarantees we need to provide. The current ioend we are
823  * adding buffers to is cached on the writepage context, and if the new buffer
824  * does not append to the cached ioend it will create a new ioend and cache that
825  * instead.
826  *
827  * If a new ioend is created and cached, the old ioend is returned and queued
828  * locally for submission once the entire page is processed or an error has been
829  * detected.  While ioends are submitted immediately after they are completed,
830  * batching optimisations are provided by higher level block plugging.
831  *
832  * At the end of a writeback pass, there will be a cached ioend remaining on the
833  * writepage context that the caller will need to submit.
834  */
835 static int
836 xfs_writepage_map(
837 	struct xfs_writepage_ctx *wpc,
838 	struct writeback_control *wbc,
839 	struct inode		*inode,
840 	struct page		*page,
841 	loff_t			offset,
842 	uint64_t              end_offset)
843 {
844 	LIST_HEAD(submit_list);
845 	struct xfs_ioend	*ioend, *next;
846 	struct buffer_head	*bh, *head;
847 	ssize_t			len = i_blocksize(inode);
848 	int			error = 0;
849 	int			count = 0;
850 	int			uptodate = 1;
851 	unsigned int		new_type;
852 
853 	bh = head = page_buffers(page);
854 	offset = page_offset(page);
855 	do {
856 		if (offset >= end_offset)
857 			break;
858 		if (!buffer_uptodate(bh))
859 			uptodate = 0;
860 
861 		/*
862 		 * set_page_dirty dirties all buffers in a page, independent
863 		 * of their state.  The dirty state however is entirely
864 		 * meaningless for holes (!mapped && uptodate), so skip
865 		 * buffers covering holes here.
866 		 */
867 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
868 			wpc->imap_valid = false;
869 			continue;
870 		}
871 
872 		if (buffer_unwritten(bh))
873 			new_type = XFS_IO_UNWRITTEN;
874 		else if (buffer_delay(bh))
875 			new_type = XFS_IO_DELALLOC;
876 		else if (buffer_uptodate(bh))
877 			new_type = XFS_IO_OVERWRITE;
878 		else {
879 			if (PageUptodate(page))
880 				ASSERT(buffer_mapped(bh));
881 			/*
882 			 * This buffer is not uptodate and will not be
883 			 * written to disk.  Ensure that we will put any
884 			 * subsequent writeable buffers into a new
885 			 * ioend.
886 			 */
887 			wpc->imap_valid = false;
888 			continue;
889 		}
890 
891 		if (xfs_is_reflink_inode(XFS_I(inode))) {
892 			error = xfs_map_cow(wpc, inode, offset, &new_type);
893 			if (error)
894 				goto out;
895 		}
896 
897 		if (wpc->io_type != new_type) {
898 			wpc->io_type = new_type;
899 			wpc->imap_valid = false;
900 		}
901 
902 		if (wpc->imap_valid)
903 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
904 							 offset);
905 		if (!wpc->imap_valid) {
906 			error = xfs_map_blocks(inode, offset, &wpc->imap,
907 					     wpc->io_type);
908 			if (error)
909 				goto out;
910 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
911 							 offset);
912 		}
913 		if (wpc->imap_valid) {
914 			lock_buffer(bh);
915 			if (wpc->io_type != XFS_IO_OVERWRITE)
916 				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
917 			xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
918 			count++;
919 		}
920 
921 	} while (offset += len, ((bh = bh->b_this_page) != head));
922 
923 	if (uptodate && bh == head)
924 		SetPageUptodate(page);
925 
926 	ASSERT(wpc->ioend || list_empty(&submit_list));
927 
928 out:
929 	/*
930 	 * On error, we have to fail the ioend here because we have locked
931 	 * buffers in the ioend. If we don't do this, we'll deadlock
932 	 * invalidating the page as that tries to lock the buffers on the page.
933 	 * Also, because we may have set pages under writeback, we have to make
934 	 * sure we run IO completion to mark the error state of the IO
935 	 * appropriately, so we can't cancel the ioend directly here. That means
936 	 * we have to mark this page as under writeback if we included any
937 	 * buffers from it in the ioend chain so that completion treats it
938 	 * correctly.
939 	 *
940 	 * If we didn't include the page in the ioend, the on error we can
941 	 * simply discard and unlock it as there are no other users of the page
942 	 * or it's buffers right now. The caller will still need to trigger
943 	 * submission of outstanding ioends on the writepage context so they are
944 	 * treated correctly on error.
945 	 */
946 	if (count) {
947 		xfs_start_page_writeback(page, !error);
948 
949 		/*
950 		 * Preserve the original error if there was one, otherwise catch
951 		 * submission errors here and propagate into subsequent ioend
952 		 * submissions.
953 		 */
954 		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
955 			int error2;
956 
957 			list_del_init(&ioend->io_list);
958 			error2 = xfs_submit_ioend(wbc, ioend, error);
959 			if (error2 && !error)
960 				error = error2;
961 		}
962 	} else if (error) {
963 		xfs_aops_discard_page(page);
964 		ClearPageUptodate(page);
965 		unlock_page(page);
966 	} else {
967 		/*
968 		 * We can end up here with no error and nothing to write if we
969 		 * race with a partial page truncate on a sub-page block sized
970 		 * filesystem. In that case we need to mark the page clean.
971 		 */
972 		xfs_start_page_writeback(page, 1);
973 		end_page_writeback(page);
974 	}
975 
976 	mapping_set_error(page->mapping, error);
977 	return error;
978 }
979 
980 /*
981  * Write out a dirty page.
982  *
983  * For delalloc space on the page we need to allocate space and flush it.
984  * For unwritten space on the page we need to start the conversion to
985  * regular allocated space.
986  * For any other dirty buffer heads on the page we should flush them.
987  */
988 STATIC int
989 xfs_do_writepage(
990 	struct page		*page,
991 	struct writeback_control *wbc,
992 	void			*data)
993 {
994 	struct xfs_writepage_ctx *wpc = data;
995 	struct inode		*inode = page->mapping->host;
996 	loff_t			offset;
997 	uint64_t              end_offset;
998 	pgoff_t                 end_index;
999 
1000 	trace_xfs_writepage(inode, page, 0, 0);
1001 
1002 	ASSERT(page_has_buffers(page));
1003 
1004 	/*
1005 	 * Refuse to write the page out if we are called from reclaim context.
1006 	 *
1007 	 * This avoids stack overflows when called from deeply used stacks in
1008 	 * random callers for direct reclaim or memcg reclaim.  We explicitly
1009 	 * allow reclaim from kswapd as the stack usage there is relatively low.
1010 	 *
1011 	 * This should never happen except in the case of a VM regression so
1012 	 * warn about it.
1013 	 */
1014 	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1015 			PF_MEMALLOC))
1016 		goto redirty;
1017 
1018 	/*
1019 	 * Given that we do not allow direct reclaim to call us, we should
1020 	 * never be called while in a filesystem transaction.
1021 	 */
1022 	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
1023 		goto redirty;
1024 
1025 	/*
1026 	 * Is this page beyond the end of the file?
1027 	 *
1028 	 * The page index is less than the end_index, adjust the end_offset
1029 	 * to the highest offset that this page should represent.
1030 	 * -----------------------------------------------------
1031 	 * |			file mapping	       | <EOF> |
1032 	 * -----------------------------------------------------
1033 	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1034 	 * ^--------------------------------^----------|--------
1035 	 * |     desired writeback range    |      see else    |
1036 	 * ---------------------------------^------------------|
1037 	 */
1038 	offset = i_size_read(inode);
1039 	end_index = offset >> PAGE_SHIFT;
1040 	if (page->index < end_index)
1041 		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
1042 	else {
1043 		/*
1044 		 * Check whether the page to write out is beyond or straddles
1045 		 * i_size or not.
1046 		 * -------------------------------------------------------
1047 		 * |		file mapping		        | <EOF>  |
1048 		 * -------------------------------------------------------
1049 		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1050 		 * ^--------------------------------^-----------|---------
1051 		 * |				    |      Straddles     |
1052 		 * ---------------------------------^-----------|--------|
1053 		 */
1054 		unsigned offset_into_page = offset & (PAGE_SIZE - 1);
1055 
1056 		/*
1057 		 * Skip the page if it is fully outside i_size, e.g. due to a
1058 		 * truncate operation that is in progress. We must redirty the
1059 		 * page so that reclaim stops reclaiming it. Otherwise
1060 		 * xfs_vm_releasepage() is called on it and gets confused.
1061 		 *
1062 		 * Note that the end_index is unsigned long, it would overflow
1063 		 * if the given offset is greater than 16TB on 32-bit system
1064 		 * and if we do check the page is fully outside i_size or not
1065 		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
1066 		 * will be evaluated to 0.  Hence this page will be redirtied
1067 		 * and be written out repeatedly which would result in an
1068 		 * infinite loop, the user program that perform this operation
1069 		 * will hang.  Instead, we can verify this situation by checking
1070 		 * if the page to write is totally beyond the i_size or if it's
1071 		 * offset is just equal to the EOF.
1072 		 */
1073 		if (page->index > end_index ||
1074 		    (page->index == end_index && offset_into_page == 0))
1075 			goto redirty;
1076 
1077 		/*
1078 		 * The page straddles i_size.  It must be zeroed out on each
1079 		 * and every writepage invocation because it may be mmapped.
1080 		 * "A file is mapped in multiples of the page size.  For a file
1081 		 * that is not a multiple of the page size, the remaining
1082 		 * memory is zeroed when mapped, and writes to that region are
1083 		 * not written out to the file."
1084 		 */
1085 		zero_user_segment(page, offset_into_page, PAGE_SIZE);
1086 
1087 		/* Adjust the end_offset to the end of file */
1088 		end_offset = offset;
1089 	}
1090 
1091 	return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1092 
1093 redirty:
1094 	redirty_page_for_writepage(wbc, page);
1095 	unlock_page(page);
1096 	return 0;
1097 }
1098 
1099 STATIC int
1100 xfs_vm_writepage(
1101 	struct page		*page,
1102 	struct writeback_control *wbc)
1103 {
1104 	struct xfs_writepage_ctx wpc = {
1105 		.io_type = XFS_IO_INVALID,
1106 	};
1107 	int			ret;
1108 
1109 	ret = xfs_do_writepage(page, wbc, &wpc);
1110 	if (wpc.ioend)
1111 		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1112 	return ret;
1113 }
1114 
1115 STATIC int
1116 xfs_vm_writepages(
1117 	struct address_space	*mapping,
1118 	struct writeback_control *wbc)
1119 {
1120 	struct xfs_writepage_ctx wpc = {
1121 		.io_type = XFS_IO_INVALID,
1122 	};
1123 	int			ret;
1124 
1125 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1126 	if (dax_mapping(mapping))
1127 		return dax_writeback_mapping_range(mapping,
1128 				xfs_find_bdev_for_inode(mapping->host), wbc);
1129 
1130 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1131 	if (wpc.ioend)
1132 		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1133 	return ret;
1134 }
1135 
1136 /*
1137  * Called to move a page into cleanable state - and from there
1138  * to be released. The page should already be clean. We always
1139  * have buffer heads in this call.
1140  *
1141  * Returns 1 if the page is ok to release, 0 otherwise.
1142  */
1143 STATIC int
1144 xfs_vm_releasepage(
1145 	struct page		*page,
1146 	gfp_t			gfp_mask)
1147 {
1148 	int			delalloc, unwritten;
1149 
1150 	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1151 
1152 	/*
1153 	 * mm accommodates an old ext3 case where clean pages might not have had
1154 	 * the dirty bit cleared. Thus, it can send actual dirty pages to
1155 	 * ->releasepage() via shrink_active_list(). Conversely,
1156 	 * block_invalidatepage() can send pages that are still marked dirty
1157 	 * but otherwise have invalidated buffers.
1158 	 *
1159 	 * We want to release the latter to avoid unnecessary buildup of the
1160 	 * LRU, skip the former and warn if we've left any lingering
1161 	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
1162 	 * or unwritten buffers and warn if the page is not dirty. Otherwise
1163 	 * try to release the buffers.
1164 	 */
1165 	xfs_count_page_state(page, &delalloc, &unwritten);
1166 
1167 	if (delalloc) {
1168 		WARN_ON_ONCE(!PageDirty(page));
1169 		return 0;
1170 	}
1171 	if (unwritten) {
1172 		WARN_ON_ONCE(!PageDirty(page));
1173 		return 0;
1174 	}
1175 
1176 	return try_to_free_buffers(page);
1177 }
1178 
1179 /*
1180  * If this is O_DIRECT or the mpage code calling tell them how large the mapping
1181  * is, so that we can avoid repeated get_blocks calls.
1182  *
1183  * If the mapping spans EOF, then we have to break the mapping up as the mapping
1184  * for blocks beyond EOF must be marked new so that sub block regions can be
1185  * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1186  * was just allocated or is unwritten, otherwise the callers would overwrite
1187  * existing data with zeros. Hence we have to split the mapping into a range up
1188  * to and including EOF, and a second mapping for beyond EOF.
1189  */
1190 static void
1191 xfs_map_trim_size(
1192 	struct inode		*inode,
1193 	sector_t		iblock,
1194 	struct buffer_head	*bh_result,
1195 	struct xfs_bmbt_irec	*imap,
1196 	xfs_off_t		offset,
1197 	ssize_t			size)
1198 {
1199 	xfs_off_t		mapping_size;
1200 
1201 	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1202 	mapping_size <<= inode->i_blkbits;
1203 
1204 	ASSERT(mapping_size > 0);
1205 	if (mapping_size > size)
1206 		mapping_size = size;
1207 	if (offset < i_size_read(inode) &&
1208 	    offset + mapping_size >= i_size_read(inode)) {
1209 		/* limit mapping to block that spans EOF */
1210 		mapping_size = roundup_64(i_size_read(inode) - offset,
1211 					  i_blocksize(inode));
1212 	}
1213 	if (mapping_size > LONG_MAX)
1214 		mapping_size = LONG_MAX;
1215 
1216 	bh_result->b_size = mapping_size;
1217 }
1218 
1219 static int
1220 xfs_get_blocks(
1221 	struct inode		*inode,
1222 	sector_t		iblock,
1223 	struct buffer_head	*bh_result,
1224 	int			create)
1225 {
1226 	struct xfs_inode	*ip = XFS_I(inode);
1227 	struct xfs_mount	*mp = ip->i_mount;
1228 	xfs_fileoff_t		offset_fsb, end_fsb;
1229 	int			error = 0;
1230 	int			lockmode = 0;
1231 	struct xfs_bmbt_irec	imap;
1232 	int			nimaps = 1;
1233 	xfs_off_t		offset;
1234 	ssize_t			size;
1235 
1236 	BUG_ON(create);
1237 
1238 	if (XFS_FORCED_SHUTDOWN(mp))
1239 		return -EIO;
1240 
1241 	offset = (xfs_off_t)iblock << inode->i_blkbits;
1242 	ASSERT(bh_result->b_size >= i_blocksize(inode));
1243 	size = bh_result->b_size;
1244 
1245 	if (offset >= i_size_read(inode))
1246 		return 0;
1247 
1248 	/*
1249 	 * Direct I/O is usually done on preallocated files, so try getting
1250 	 * a block mapping without an exclusive lock first.
1251 	 */
1252 	lockmode = xfs_ilock_data_map_shared(ip);
1253 
1254 	ASSERT(offset <= mp->m_super->s_maxbytes);
1255 	if (offset + size > mp->m_super->s_maxbytes)
1256 		size = mp->m_super->s_maxbytes - offset;
1257 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1258 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
1259 
1260 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1261 				&imap, &nimaps, XFS_BMAPI_ENTIRE);
1262 	if (error)
1263 		goto out_unlock;
1264 
1265 	if (nimaps) {
1266 		trace_xfs_get_blocks_found(ip, offset, size,
1267 			imap.br_state == XFS_EXT_UNWRITTEN ?
1268 				XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
1269 		xfs_iunlock(ip, lockmode);
1270 	} else {
1271 		trace_xfs_get_blocks_notfound(ip, offset, size);
1272 		goto out_unlock;
1273 	}
1274 
1275 	/* trim mapping down to size requested */
1276 	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1277 
1278 	/*
1279 	 * For unwritten extents do not report a disk address in the buffered
1280 	 * read case (treat as if we're reading into a hole).
1281 	 */
1282 	if (xfs_bmap_is_real_extent(&imap))
1283 		xfs_map_buffer(inode, bh_result, &imap, offset);
1284 
1285 	/*
1286 	 * If this is a realtime file, data may be on a different device.
1287 	 * to that pointed to from the buffer_head b_bdev currently.
1288 	 */
1289 	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1290 	return 0;
1291 
1292 out_unlock:
1293 	xfs_iunlock(ip, lockmode);
1294 	return error;
1295 }
1296 
1297 STATIC ssize_t
1298 xfs_vm_direct_IO(
1299 	struct kiocb		*iocb,
1300 	struct iov_iter		*iter)
1301 {
1302 	/*
1303 	 * We just need the method present so that open/fcntl allow direct I/O.
1304 	 */
1305 	return -EINVAL;
1306 }
1307 
1308 STATIC sector_t
1309 xfs_vm_bmap(
1310 	struct address_space	*mapping,
1311 	sector_t		block)
1312 {
1313 	struct inode		*inode = (struct inode *)mapping->host;
1314 	struct xfs_inode	*ip = XFS_I(inode);
1315 
1316 	trace_xfs_vm_bmap(XFS_I(inode));
1317 
1318 	/*
1319 	 * The swap code (ab-)uses ->bmap to get a block mapping and then
1320 	 * bypasseѕ the file system for actual I/O.  We really can't allow
1321 	 * that on reflinks inodes, so we have to skip out here.  And yes,
1322 	 * 0 is the magic code for a bmap error.
1323 	 *
1324 	 * Since we don't pass back blockdev info, we can't return bmap
1325 	 * information for rt files either.
1326 	 */
1327 	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
1328 		return 0;
1329 
1330 	filemap_write_and_wait(mapping);
1331 	return generic_block_bmap(mapping, block, xfs_get_blocks);
1332 }
1333 
1334 STATIC int
1335 xfs_vm_readpage(
1336 	struct file		*unused,
1337 	struct page		*page)
1338 {
1339 	trace_xfs_vm_readpage(page->mapping->host, 1);
1340 	return mpage_readpage(page, xfs_get_blocks);
1341 }
1342 
1343 STATIC int
1344 xfs_vm_readpages(
1345 	struct file		*unused,
1346 	struct address_space	*mapping,
1347 	struct list_head	*pages,
1348 	unsigned		nr_pages)
1349 {
1350 	trace_xfs_vm_readpages(mapping->host, nr_pages);
1351 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1352 }
1353 
1354 /*
1355  * This is basically a copy of __set_page_dirty_buffers() with one
1356  * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1357  * dirty, we'll never be able to clean them because we don't write buffers
1358  * beyond EOF, and that means we can't invalidate pages that span EOF
1359  * that have been marked dirty. Further, the dirty state can leak into
1360  * the file interior if the file is extended, resulting in all sorts of
1361  * bad things happening as the state does not match the underlying data.
1362  *
1363  * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1364  * this only exist because of bufferheads and how the generic code manages them.
1365  */
1366 STATIC int
1367 xfs_vm_set_page_dirty(
1368 	struct page		*page)
1369 {
1370 	struct address_space	*mapping = page->mapping;
1371 	struct inode		*inode = mapping->host;
1372 	loff_t			end_offset;
1373 	loff_t			offset;
1374 	int			newly_dirty;
1375 
1376 	if (unlikely(!mapping))
1377 		return !TestSetPageDirty(page);
1378 
1379 	end_offset = i_size_read(inode);
1380 	offset = page_offset(page);
1381 
1382 	spin_lock(&mapping->private_lock);
1383 	if (page_has_buffers(page)) {
1384 		struct buffer_head *head = page_buffers(page);
1385 		struct buffer_head *bh = head;
1386 
1387 		do {
1388 			if (offset < end_offset)
1389 				set_buffer_dirty(bh);
1390 			bh = bh->b_this_page;
1391 			offset += i_blocksize(inode);
1392 		} while (bh != head);
1393 	}
1394 	/*
1395 	 * Lock out page->mem_cgroup migration to keep PageDirty
1396 	 * synchronized with per-memcg dirty page counters.
1397 	 */
1398 	lock_page_memcg(page);
1399 	newly_dirty = !TestSetPageDirty(page);
1400 	spin_unlock(&mapping->private_lock);
1401 
1402 	if (newly_dirty) {
1403 		/* sigh - __set_page_dirty() is static, so copy it here, too */
1404 		unsigned long flags;
1405 
1406 		spin_lock_irqsave(&mapping->tree_lock, flags);
1407 		if (page->mapping) {	/* Race with truncate? */
1408 			WARN_ON_ONCE(!PageUptodate(page));
1409 			account_page_dirtied(page, mapping);
1410 			radix_tree_tag_set(&mapping->page_tree,
1411 					page_index(page), PAGECACHE_TAG_DIRTY);
1412 		}
1413 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
1414 	}
1415 	unlock_page_memcg(page);
1416 	if (newly_dirty)
1417 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1418 	return newly_dirty;
1419 }
1420 
1421 const struct address_space_operations xfs_address_space_operations = {
1422 	.readpage		= xfs_vm_readpage,
1423 	.readpages		= xfs_vm_readpages,
1424 	.writepage		= xfs_vm_writepage,
1425 	.writepages		= xfs_vm_writepages,
1426 	.set_page_dirty		= xfs_vm_set_page_dirty,
1427 	.releasepage		= xfs_vm_releasepage,
1428 	.invalidatepage		= xfs_vm_invalidatepage,
1429 	.bmap			= xfs_vm_bmap,
1430 	.direct_IO		= xfs_vm_direct_IO,
1431 	.migratepage		= buffer_migrate_page,
1432 	.is_partially_uptodate  = block_is_partially_uptodate,
1433 	.error_remove_page	= generic_error_remove_page,
1434 };
1435