xref: /openbmc/linux/fs/xfs/xfs_iomap.c (revision cd5d5810)
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_format.h"
21 #include "xfs_log.h"
22 #include "xfs_trans.h"
23 #include "xfs_sb.h"
24 #include "xfs_ag.h"
25 #include "xfs_alloc.h"
26 #include "xfs_quota.h"
27 #include "xfs_mount.h"
28 #include "xfs_bmap_btree.h"
29 #include "xfs_alloc_btree.h"
30 #include "xfs_ialloc_btree.h"
31 #include "xfs_dinode.h"
32 #include "xfs_inode.h"
33 #include "xfs_inode_item.h"
34 #include "xfs_btree.h"
35 #include "xfs_bmap.h"
36 #include "xfs_bmap_util.h"
37 #include "xfs_rtalloc.h"
38 #include "xfs_error.h"
39 #include "xfs_itable.h"
40 #include "xfs_attr.h"
41 #include "xfs_buf_item.h"
42 #include "xfs_trans_space.h"
43 #include "xfs_iomap.h"
44 #include "xfs_trace.h"
45 #include "xfs_icache.h"
46 #include "xfs_dquot_item.h"
47 #include "xfs_dquot.h"
48 
49 
/*
 * Round a byte offset down to a multiple of 2^m_writeio_log bytes.
 * Both macro arguments are parenthesized so that any expression (for
 * example "&mount" or "a + b") expands with the intended precedence.
 */
#define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> (mp)->m_writeio_log) \
						<< (mp)->m_writeio_log)
/* Size of the on-stack extent map array used by the write paths below. */
#define XFS_WRITE_IMAPS		XFS_BMAP_MAX_NMAP
53 
54 STATIC int
55 xfs_iomap_eof_align_last_fsb(
56 	xfs_mount_t	*mp,
57 	xfs_inode_t	*ip,
58 	xfs_extlen_t	extsize,
59 	xfs_fileoff_t	*last_fsb)
60 {
61 	xfs_fileoff_t	new_last_fsb = 0;
62 	xfs_extlen_t	align = 0;
63 	int		eof, error;
64 
65 	if (!XFS_IS_REALTIME_INODE(ip)) {
66 		/*
67 		 * Round up the allocation request to a stripe unit
68 		 * (m_dalign) boundary if the file size is >= stripe unit
69 		 * size, and we are allocating past the allocation eof.
70 		 *
71 		 * If mounted with the "-o swalloc" option the alignment is
72 		 * increased from the strip unit size to the stripe width.
73 		 */
74 		if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
75 			align = mp->m_swidth;
76 		else if (mp->m_dalign)
77 			align = mp->m_dalign;
78 
79 		if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
80 			new_last_fsb = roundup_64(*last_fsb, align);
81 	}
82 
83 	/*
84 	 * Always round up the allocation request to an extent boundary
85 	 * (when file on a real-time subvolume or has di_extsize hint).
86 	 */
87 	if (extsize) {
88 		if (new_last_fsb)
89 			align = roundup_64(new_last_fsb, extsize);
90 		else
91 			align = extsize;
92 		new_last_fsb = roundup_64(*last_fsb, align);
93 	}
94 
95 	if (new_last_fsb) {
96 		error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
97 		if (error)
98 			return error;
99 		if (eof)
100 			*last_fsb = new_last_fsb;
101 	}
102 	return 0;
103 }
104 
105 STATIC int
106 xfs_alert_fsblock_zero(
107 	xfs_inode_t	*ip,
108 	xfs_bmbt_irec_t	*imap)
109 {
110 	xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
111 			"Access to block zero in inode %llu "
112 			"start_block: %llx start_off: %llx "
113 			"blkcnt: %llx extent-state: %x\n",
114 		(unsigned long long)ip->i_ino,
115 		(unsigned long long)imap->br_startblock,
116 		(unsigned long long)imap->br_startoff,
117 		(unsigned long long)imap->br_blockcount,
118 		imap->br_state);
119 	return EFSCORRUPTED;
120 }
121 
122 int
123 xfs_iomap_write_direct(
124 	xfs_inode_t	*ip,
125 	xfs_off_t	offset,
126 	size_t		count,
127 	xfs_bmbt_irec_t *imap,
128 	int		nmaps)
129 {
130 	xfs_mount_t	*mp = ip->i_mount;
131 	xfs_fileoff_t	offset_fsb;
132 	xfs_fileoff_t	last_fsb;
133 	xfs_filblks_t	count_fsb, resaligned;
134 	xfs_fsblock_t	firstfsb;
135 	xfs_extlen_t	extsz, temp;
136 	int		nimaps;
137 	int		bmapi_flag;
138 	int		quota_flag;
139 	int		rt;
140 	xfs_trans_t	*tp;
141 	xfs_bmap_free_t free_list;
142 	uint		qblocks, resblks, resrtextents;
143 	int		committed;
144 	int		error;
145 
146 	error = xfs_qm_dqattach(ip, 0);
147 	if (error)
148 		return XFS_ERROR(error);
149 
150 	rt = XFS_IS_REALTIME_INODE(ip);
151 	extsz = xfs_get_extsz_hint(ip);
152 
153 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
154 	last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
155 	if ((offset + count) > XFS_ISIZE(ip)) {
156 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
157 		if (error)
158 			return XFS_ERROR(error);
159 	} else {
160 		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
161 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
162 					imap->br_blockcount +
163 					imap->br_startoff);
164 	}
165 	count_fsb = last_fsb - offset_fsb;
166 	ASSERT(count_fsb > 0);
167 
168 	resaligned = count_fsb;
169 	if (unlikely(extsz)) {
170 		if ((temp = do_mod(offset_fsb, extsz)))
171 			resaligned += temp;
172 		if ((temp = do_mod(resaligned, extsz)))
173 			resaligned += extsz - temp;
174 	}
175 
176 	if (unlikely(rt)) {
177 		resrtextents = qblocks = resaligned;
178 		resrtextents /= mp->m_sb.sb_rextsize;
179 		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
180 		quota_flag = XFS_QMOPT_RES_RTBLKS;
181 	} else {
182 		resrtextents = 0;
183 		resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
184 		quota_flag = XFS_QMOPT_RES_REGBLKS;
185 	}
186 
187 	/*
188 	 * Allocate and setup the transaction
189 	 */
190 	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
191 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
192 				  resblks, resrtextents);
193 	/*
194 	 * Check for running out of space, note: need lock to return
195 	 */
196 	if (error) {
197 		xfs_trans_cancel(tp, 0);
198 		return XFS_ERROR(error);
199 	}
200 
201 	xfs_ilock(ip, XFS_ILOCK_EXCL);
202 
203 	error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
204 	if (error)
205 		goto out_trans_cancel;
206 
207 	xfs_trans_ijoin(tp, ip, 0);
208 
209 	bmapi_flag = 0;
210 	if (offset < XFS_ISIZE(ip) || extsz)
211 		bmapi_flag |= XFS_BMAPI_PREALLOC;
212 
213 	/*
214 	 * From this point onwards we overwrite the imap pointer that the
215 	 * caller gave to us.
216 	 */
217 	xfs_bmap_init(&free_list, &firstfsb);
218 	nimaps = 1;
219 	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
220 				&firstfsb, 0, imap, &nimaps, &free_list);
221 	if (error)
222 		goto out_bmap_cancel;
223 
224 	/*
225 	 * Complete the transaction
226 	 */
227 	error = xfs_bmap_finish(&tp, &free_list, &committed);
228 	if (error)
229 		goto out_bmap_cancel;
230 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
231 	if (error)
232 		goto out_unlock;
233 
234 	/*
235 	 * Copy any maps to caller's array and return any error.
236 	 */
237 	if (nimaps == 0) {
238 		error = XFS_ERROR(ENOSPC);
239 		goto out_unlock;
240 	}
241 
242 	if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
243 		error = xfs_alert_fsblock_zero(ip, imap);
244 
245 out_unlock:
246 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
247 	return error;
248 
249 out_bmap_cancel:
250 	xfs_bmap_cancel(&free_list);
251 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
252 out_trans_cancel:
253 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
254 	goto out_unlock;
255 }
256 
257 /*
258  * If the caller is doing a write at the end of the file, then extend the
259  * allocation out to the file system's write iosize.  We clean up any extra
260  * space left over when the file is closed in xfs_inactive().
261  *
262  * If we find we already have delalloc preallocation beyond EOF, don't do more
263  * preallocation as it it not needed.
264  */
265 STATIC int
266 xfs_iomap_eof_want_preallocate(
267 	xfs_mount_t	*mp,
268 	xfs_inode_t	*ip,
269 	xfs_off_t	offset,
270 	size_t		count,
271 	xfs_bmbt_irec_t *imap,
272 	int		nimaps,
273 	int		*prealloc)
274 {
275 	xfs_fileoff_t   start_fsb;
276 	xfs_filblks_t   count_fsb;
277 	xfs_fsblock_t	firstblock;
278 	int		n, error, imaps;
279 	int		found_delalloc = 0;
280 
281 	*prealloc = 0;
282 	if (offset + count <= XFS_ISIZE(ip))
283 		return 0;
284 
285 	/*
286 	 * If the file is smaller than the minimum prealloc and we are using
287 	 * dynamic preallocation, don't do any preallocation at all as it is
288 	 * likely this is the only write to the file that is going to be done.
289 	 */
290 	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
291 	    XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks))
292 		return 0;
293 
294 	/*
295 	 * If there are any real blocks past eof, then don't
296 	 * do any speculative allocation.
297 	 */
298 	start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1)));
299 	count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
300 	while (count_fsb > 0) {
301 		imaps = nimaps;
302 		firstblock = NULLFSBLOCK;
303 		error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps,
304 				       0);
305 		if (error)
306 			return error;
307 		for (n = 0; n < imaps; n++) {
308 			if ((imap[n].br_startblock != HOLESTARTBLOCK) &&
309 			    (imap[n].br_startblock != DELAYSTARTBLOCK))
310 				return 0;
311 			start_fsb += imap[n].br_blockcount;
312 			count_fsb -= imap[n].br_blockcount;
313 
314 			if (imap[n].br_startblock == DELAYSTARTBLOCK)
315 				found_delalloc = 1;
316 		}
317 	}
318 	if (!found_delalloc)
319 		*prealloc = 1;
320 	return 0;
321 }
322 
323 /*
324  * Determine the initial size of the preallocation. We are beyond the current
325  * EOF here, but we need to take into account whether this is a sparse write or
326  * an extending write when determining the preallocation size.  Hence we need to
327  * look up the extent that ends at the current write offset and use the result
328  * to determine the preallocation size.
329  *
330  * If the extent is a hole, then preallocation is essentially disabled.
331  * Otherwise we take the size of the preceeding data extent as the basis for the
332  * preallocation size. If the size of the extent is greater than half the
333  * maximum extent length, then use the current offset as the basis. This ensures
334  * that for large files the preallocation size always extends to MAXEXTLEN
335  * rather than falling short due to things like stripe unit/width alignment of
336  * real extents.
337  */
338 STATIC xfs_fsblock_t
339 xfs_iomap_eof_prealloc_initial_size(
340 	struct xfs_mount	*mp,
341 	struct xfs_inode	*ip,
342 	xfs_off_t		offset,
343 	xfs_bmbt_irec_t		*imap,
344 	int			nimaps)
345 {
346 	xfs_fileoff_t   start_fsb;
347 	int		imaps = 1;
348 	int		error;
349 
350 	ASSERT(nimaps >= imaps);
351 
352 	/* if we are using a specific prealloc size, return now */
353 	if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
354 		return 0;
355 
356 	/* If the file is small, then use the minimum prealloc */
357 	if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign))
358 		return 0;
359 
360 	/*
361 	 * As we write multiple pages, the offset will always align to the
362 	 * start of a page and hence point to a hole at EOF. i.e. if the size is
363 	 * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096)
364 	 * will return FSB 1. Hence if there are blocks in the file, we want to
365 	 * point to the block prior to the EOF block and not the hole that maps
366 	 * directly at @offset.
367 	 */
368 	start_fsb = XFS_B_TO_FSB(mp, offset);
369 	if (start_fsb)
370 		start_fsb--;
371 	error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE);
372 	if (error)
373 		return 0;
374 
375 	ASSERT(imaps == 1);
376 	if (imap[0].br_startblock == HOLESTARTBLOCK)
377 		return 0;
378 	if (imap[0].br_blockcount <= (MAXEXTLEN >> 1))
379 		return imap[0].br_blockcount << 1;
380 	return XFS_B_TO_FSB(mp, offset);
381 }
382 
383 STATIC bool
384 xfs_quota_need_throttle(
385 	struct xfs_inode *ip,
386 	int type,
387 	xfs_fsblock_t alloc_blocks)
388 {
389 	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
390 
391 	if (!dq || !xfs_this_quota_on(ip->i_mount, type))
392 		return false;
393 
394 	/* no hi watermark, no throttle */
395 	if (!dq->q_prealloc_hi_wmark)
396 		return false;
397 
398 	/* under the lo watermark, no throttle */
399 	if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark)
400 		return false;
401 
402 	return true;
403 }
404 
405 STATIC void
406 xfs_quota_calc_throttle(
407 	struct xfs_inode *ip,
408 	int type,
409 	xfs_fsblock_t *qblocks,
410 	int *qshift)
411 {
412 	int64_t freesp;
413 	int shift = 0;
414 	struct xfs_dquot *dq = xfs_inode_dquot(ip, type);
415 
416 	/* over hi wmark, squash the prealloc completely */
417 	if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
418 		*qblocks = 0;
419 		return;
420 	}
421 
422 	freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount;
423 	if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) {
424 		shift = 2;
425 		if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT])
426 			shift += 2;
427 		if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT])
428 			shift += 2;
429 	}
430 
431 	/* only overwrite the throttle values if we are more aggressive */
432 	if ((freesp >> shift) < (*qblocks >> *qshift)) {
433 		*qblocks = freesp;
434 		*qshift = shift;
435 	}
436 }
437 
438 /*
439  * If we don't have a user specified preallocation size, dynamically increase
440  * the preallocation size as the size of the file grows. Cap the maximum size
441  * at a single extent or less if the filesystem is near full. The closer the
442  * filesystem is to full, the smaller the maximum prealocation.
443  */
444 STATIC xfs_fsblock_t
445 xfs_iomap_prealloc_size(
446 	struct xfs_mount	*mp,
447 	struct xfs_inode	*ip,
448 	xfs_off_t		offset,
449 	struct xfs_bmbt_irec	*imap,
450 	int			nimaps)
451 {
452 	xfs_fsblock_t		alloc_blocks = 0;
453 	int			shift = 0;
454 	int64_t			freesp;
455 	xfs_fsblock_t		qblocks;
456 	int			qshift = 0;
457 
458 	alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset,
459 							   imap, nimaps);
460 	if (!alloc_blocks)
461 		goto check_writeio;
462 	qblocks = alloc_blocks;
463 
464 	/*
465 	 * MAXEXTLEN is not a power of two value but we round the prealloc down
466 	 * to the nearest power of two value after throttling. To prevent the
467 	 * round down from unconditionally reducing the maximum supported prealloc
468 	 * size, we round up first, apply appropriate throttling, round down and
469 	 * cap the value to MAXEXTLEN.
470 	 */
471 	alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
472 				       alloc_blocks);
473 
474 	xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
475 	freesp = mp->m_sb.sb_fdblocks;
476 	if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
477 		shift = 2;
478 		if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
479 			shift++;
480 		if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
481 			shift++;
482 		if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
483 			shift++;
484 		if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
485 			shift++;
486 	}
487 
488 	/*
489 	 * Check each quota to cap the prealloc size and provide a shift
490 	 * value to throttle with.
491 	 */
492 	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
493 		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
494 	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
495 		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
496 	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
497 		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
498 
499 	/*
500 	 * The final prealloc size is set to the minimum of free space available
501 	 * in each of the quotas and the overall filesystem.
502 	 *
503 	 * The shift throttle value is set to the maximum value as determined by
504 	 * the global low free space values and per-quota low free space values.
505 	 */
506 	alloc_blocks = MIN(alloc_blocks, qblocks);
507 	shift = MAX(shift, qshift);
508 
509 	if (shift)
510 		alloc_blocks >>= shift;
511 	/*
512 	 * rounddown_pow_of_two() returns an undefined result if we pass in
513 	 * alloc_blocks = 0.
514 	 */
515 	if (alloc_blocks)
516 		alloc_blocks = rounddown_pow_of_two(alloc_blocks);
517 	if (alloc_blocks > MAXEXTLEN)
518 		alloc_blocks = MAXEXTLEN;
519 
520 	/*
521 	 * If we are still trying to allocate more space than is
522 	 * available, squash the prealloc hard. This can happen if we
523 	 * have a large file on a small filesystem and the above
524 	 * lowspace thresholds are smaller than MAXEXTLEN.
525 	 */
526 	while (alloc_blocks && alloc_blocks >= freesp)
527 		alloc_blocks >>= 4;
528 
529 check_writeio:
530 	if (alloc_blocks < mp->m_writeio_blocks)
531 		alloc_blocks = mp->m_writeio_blocks;
532 
533 	trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
534 				      mp->m_writeio_blocks);
535 
536 	return alloc_blocks;
537 }
538 
539 int
540 xfs_iomap_write_delay(
541 	xfs_inode_t	*ip,
542 	xfs_off_t	offset,
543 	size_t		count,
544 	xfs_bmbt_irec_t *ret_imap)
545 {
546 	xfs_mount_t	*mp = ip->i_mount;
547 	xfs_fileoff_t	offset_fsb;
548 	xfs_fileoff_t	last_fsb;
549 	xfs_off_t	aligned_offset;
550 	xfs_fileoff_t	ioalign;
551 	xfs_extlen_t	extsz;
552 	int		nimaps;
553 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
554 	int		prealloc;
555 	int		error;
556 
557 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
558 
559 	/*
560 	 * Make sure that the dquots are there. This doesn't hold
561 	 * the ilock across a disk read.
562 	 */
563 	error = xfs_qm_dqattach_locked(ip, 0);
564 	if (error)
565 		return XFS_ERROR(error);
566 
567 	extsz = xfs_get_extsz_hint(ip);
568 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
569 
570 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
571 				imap, XFS_WRITE_IMAPS, &prealloc);
572 	if (error)
573 		return error;
574 
575 retry:
576 	if (prealloc) {
577 		xfs_fsblock_t	alloc_blocks;
578 
579 		alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap,
580 						       XFS_WRITE_IMAPS);
581 
582 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
583 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
584 		last_fsb = ioalign + alloc_blocks;
585 	} else {
586 		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
587 	}
588 
589 	if (prealloc || extsz) {
590 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
591 		if (error)
592 			return error;
593 	}
594 
595 	/*
596 	 * Make sure preallocation does not create extents beyond the range we
597 	 * actually support in this filesystem.
598 	 */
599 	if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes))
600 		last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
601 
602 	ASSERT(last_fsb > offset_fsb);
603 
604 	nimaps = XFS_WRITE_IMAPS;
605 	error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb,
606 				imap, &nimaps, XFS_BMAPI_ENTIRE);
607 	switch (error) {
608 	case 0:
609 	case ENOSPC:
610 	case EDQUOT:
611 		break;
612 	default:
613 		return XFS_ERROR(error);
614 	}
615 
616 	/*
617 	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
618 	 * without EOF preallocation.
619 	 */
620 	if (nimaps == 0) {
621 		trace_xfs_delalloc_enospc(ip, offset, count);
622 		if (prealloc) {
623 			prealloc = 0;
624 			error = 0;
625 			goto retry;
626 		}
627 		return XFS_ERROR(error ? error : ENOSPC);
628 	}
629 
630 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
631 		return xfs_alert_fsblock_zero(ip, &imap[0]);
632 
633 	/*
634 	 * Tag the inode as speculatively preallocated so we can reclaim this
635 	 * space on demand, if necessary.
636 	 */
637 	if (prealloc)
638 		xfs_inode_set_eofblocks_tag(ip);
639 
640 	*ret_imap = imap[0];
641 	return 0;
642 }
643 
644 /*
645  * Pass in a delayed allocate extent, convert it to real extents;
646  * return to the caller the extent we create which maps on top of
647  * the originating callers request.
648  *
649  * Called without a lock on the inode.
650  *
651  * We no longer bother to look at the incoming map - all we have to
652  * guarantee is that whatever we allocate fills the required range.
653  */
654 int
655 xfs_iomap_write_allocate(
656 	xfs_inode_t	*ip,
657 	xfs_off_t	offset,
658 	size_t		count,
659 	xfs_bmbt_irec_t *imap)
660 {
661 	xfs_mount_t	*mp = ip->i_mount;
662 	xfs_fileoff_t	offset_fsb, last_block;
663 	xfs_fileoff_t	end_fsb, map_start_fsb;
664 	xfs_fsblock_t	first_block;
665 	xfs_bmap_free_t	free_list;
666 	xfs_filblks_t	count_fsb;
667 	xfs_trans_t	*tp;
668 	int		nimaps, committed;
669 	int		error = 0;
670 	int		nres;
671 
672 	/*
673 	 * Make sure that the dquots are there.
674 	 */
675 	error = xfs_qm_dqattach(ip, 0);
676 	if (error)
677 		return XFS_ERROR(error);
678 
679 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
680 	count_fsb = imap->br_blockcount;
681 	map_start_fsb = imap->br_startoff;
682 
683 	XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
684 
685 	while (count_fsb != 0) {
686 		/*
687 		 * Set up a transaction with which to allocate the
688 		 * backing store for the file.  Do allocations in a
689 		 * loop until we get some space in the range we are
690 		 * interested in.  The other space that might be allocated
691 		 * is in the delayed allocation extent on which we sit
692 		 * but before our buffer starts.
693 		 */
694 
695 		nimaps = 0;
696 		while (nimaps == 0) {
697 			tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE);
698 			tp->t_flags |= XFS_TRANS_RESERVE;
699 			nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
700 			error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
701 						  nres, 0);
702 			if (error) {
703 				xfs_trans_cancel(tp, 0);
704 				return XFS_ERROR(error);
705 			}
706 			xfs_ilock(ip, XFS_ILOCK_EXCL);
707 			xfs_trans_ijoin(tp, ip, 0);
708 
709 			xfs_bmap_init(&free_list, &first_block);
710 
711 			/*
712 			 * it is possible that the extents have changed since
713 			 * we did the read call as we dropped the ilock for a
714 			 * while. We have to be careful about truncates or hole
715 			 * punchs here - we are not allowed to allocate
716 			 * non-delalloc blocks here.
717 			 *
718 			 * The only protection against truncation is the pages
719 			 * for the range we are being asked to convert are
720 			 * locked and hence a truncate will block on them
721 			 * first.
722 			 *
723 			 * As a result, if we go beyond the range we really
724 			 * need and hit an delalloc extent boundary followed by
725 			 * a hole while we have excess blocks in the map, we
726 			 * will fill the hole incorrectly and overrun the
727 			 * transaction reservation.
728 			 *
729 			 * Using a single map prevents this as we are forced to
730 			 * check each map we look for overlap with the desired
731 			 * range and abort as soon as we find it. Also, given
732 			 * that we only return a single map, having one beyond
733 			 * what we can return is probably a bit silly.
734 			 *
735 			 * We also need to check that we don't go beyond EOF;
736 			 * this is a truncate optimisation as a truncate sets
737 			 * the new file size before block on the pages we
738 			 * currently have locked under writeback. Because they
739 			 * are about to be tossed, we don't need to write them
740 			 * back....
741 			 */
742 			nimaps = 1;
743 			end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
744 			error = xfs_bmap_last_offset(NULL, ip, &last_block,
745 							XFS_DATA_FORK);
746 			if (error)
747 				goto trans_cancel;
748 
749 			last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
750 			if ((map_start_fsb + count_fsb) > last_block) {
751 				count_fsb = last_block - map_start_fsb;
752 				if (count_fsb == 0) {
753 					error = EAGAIN;
754 					goto trans_cancel;
755 				}
756 			}
757 
758 			/*
759 			 * From this point onwards we overwrite the imap
760 			 * pointer that the caller gave to us.
761 			 */
762 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
763 						count_fsb,
764 						XFS_BMAPI_STACK_SWITCH,
765 						&first_block, 1,
766 						imap, &nimaps, &free_list);
767 			if (error)
768 				goto trans_cancel;
769 
770 			error = xfs_bmap_finish(&tp, &free_list, &committed);
771 			if (error)
772 				goto trans_cancel;
773 
774 			error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
775 			if (error)
776 				goto error0;
777 
778 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
779 		}
780 
781 		/*
782 		 * See if we were able to allocate an extent that
783 		 * covers at least part of the callers request
784 		 */
785 		if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
786 			return xfs_alert_fsblock_zero(ip, imap);
787 
788 		if ((offset_fsb >= imap->br_startoff) &&
789 		    (offset_fsb < (imap->br_startoff +
790 				   imap->br_blockcount))) {
791 			XFS_STATS_INC(xs_xstrat_quick);
792 			return 0;
793 		}
794 
795 		/*
796 		 * So far we have not mapped the requested part of the
797 		 * file, just surrounding data, try again.
798 		 */
799 		count_fsb -= imap->br_blockcount;
800 		map_start_fsb = imap->br_startoff + imap->br_blockcount;
801 	}
802 
803 trans_cancel:
804 	xfs_bmap_cancel(&free_list);
805 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
806 error0:
807 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
808 	return XFS_ERROR(error);
809 }
810 
811 int
812 xfs_iomap_write_unwritten(
813 	xfs_inode_t	*ip,
814 	xfs_off_t	offset,
815 	size_t		count)
816 {
817 	xfs_mount_t	*mp = ip->i_mount;
818 	xfs_fileoff_t	offset_fsb;
819 	xfs_filblks_t	count_fsb;
820 	xfs_filblks_t	numblks_fsb;
821 	xfs_fsblock_t	firstfsb;
822 	int		nimaps;
823 	xfs_trans_t	*tp;
824 	xfs_bmbt_irec_t imap;
825 	xfs_bmap_free_t free_list;
826 	xfs_fsize_t	i_size;
827 	uint		resblks;
828 	int		committed;
829 	int		error;
830 
831 	trace_xfs_unwritten_convert(ip, offset, count);
832 
833 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
834 	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
835 	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
836 
837 	/*
838 	 * Reserve enough blocks in this transaction for two complete extent
839 	 * btree splits.  We may be converting the middle part of an unwritten
840 	 * extent and in this case we will insert two new extents in the btree
841 	 * each of which could cause a full split.
842 	 *
843 	 * This reservation amount will be used in the first call to
844 	 * xfs_bmbt_split() to select an AG with enough space to satisfy the
845 	 * rest of the operation.
846 	 */
847 	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
848 
849 	do {
850 		/*
851 		 * set up a transaction to convert the range of extents
852 		 * from unwritten to real. Do allocations in a loop until
853 		 * we have covered the range passed in.
854 		 *
855 		 * Note that we open code the transaction allocation here
856 		 * to pass KM_NOFS--we can't risk to recursing back into
857 		 * the filesystem here as we might be asked to write out
858 		 * the same inode that we complete here and might deadlock
859 		 * on the iolock.
860 		 */
861 		sb_start_intwrite(mp->m_super);
862 		tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
863 		tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
864 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
865 					  resblks, 0);
866 		if (error) {
867 			xfs_trans_cancel(tp, 0);
868 			return XFS_ERROR(error);
869 		}
870 
871 		xfs_ilock(ip, XFS_ILOCK_EXCL);
872 		xfs_trans_ijoin(tp, ip, 0);
873 
874 		/*
875 		 * Modify the unwritten extent state of the buffer.
876 		 */
877 		xfs_bmap_init(&free_list, &firstfsb);
878 		nimaps = 1;
879 		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
880 				  XFS_BMAPI_CONVERT, &firstfsb,
881 				  1, &imap, &nimaps, &free_list);
882 		if (error)
883 			goto error_on_bmapi_transaction;
884 
885 		/*
886 		 * Log the updated inode size as we go.  We have to be careful
887 		 * to only log it up to the actual write offset if it is
888 		 * halfway into a block.
889 		 */
890 		i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
891 		if (i_size > offset + count)
892 			i_size = offset + count;
893 
894 		i_size = xfs_new_eof(ip, i_size);
895 		if (i_size) {
896 			ip->i_d.di_size = i_size;
897 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
898 		}
899 
900 		error = xfs_bmap_finish(&tp, &free_list, &committed);
901 		if (error)
902 			goto error_on_bmapi_transaction;
903 
904 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
905 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
906 		if (error)
907 			return XFS_ERROR(error);
908 
909 		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
910 			return xfs_alert_fsblock_zero(ip, &imap);
911 
912 		if ((numblks_fsb = imap.br_blockcount) == 0) {
913 			/*
914 			 * The numblks_fsb value should always get
915 			 * smaller, otherwise the loop is stuck.
916 			 */
917 			ASSERT(imap.br_blockcount);
918 			break;
919 		}
920 		offset_fsb += numblks_fsb;
921 		count_fsb -= numblks_fsb;
922 	} while (count_fsb > 0);
923 
924 	return 0;
925 
926 error_on_bmapi_transaction:
927 	xfs_bmap_cancel(&free_list);
928 	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
929 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
930 	return XFS_ERROR(error);
931 }
932