xref: /openbmc/linux/fs/xfs/libxfs/xfs_rmap_btree.c (revision 1f5c071d)
1 /*
2  * Copyright (c) 2014 Red Hat, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_sb.h"
26 #include "xfs_mount.h"
27 #include "xfs_defer.h"
28 #include "xfs_inode.h"
29 #include "xfs_trans.h"
30 #include "xfs_alloc.h"
31 #include "xfs_btree.h"
32 #include "xfs_rmap.h"
33 #include "xfs_rmap_btree.h"
34 #include "xfs_trace.h"
35 #include "xfs_cksum.h"
36 #include "xfs_error.h"
37 #include "xfs_extent_busy.h"
38 #include "xfs_ag_resv.h"
39 
40 /*
41  * Reverse map btree.
42  *
43  * This is a per-ag tree used to track the owner(s) of a given extent. With
44  * reflink it is possible for there to be multiple owners, which is a departure
45  * from classic XFS. Owner records for data extents are inserted when the
46  * extent is mapped and removed when an extent is unmapped.  Owner records for
47  * all other block types (i.e. metadata) are inserted when an extent is
48  * allocated and removed when an extent is freed. There can only be one owner
49  * of a metadata extent, usually an inode or some other metadata structure like
50  * an AG btree.
51  *
52  * The rmap btree is part of the free space management, so blocks for the tree
53  * are sourced from the agfl. Hence we need transaction reservation support for
54  * this tree so that the freelist is always large enough. This also impacts on
55  * the minimum space we need to leave free in the AG.
56  *
57  * The tree is ordered by [ag block, owner, offset]. This is a large key size,
58  * but it is the only way to enforce unique keys when a block can be owned by
59  * multiple files at any offset. There's no need to order/search by extent
60  * size for online updating/management of the tree. It is intended that most
61  * reverse lookups will be to find the owner(s) of a particular block, or to
62  * try to recover tree and file data from corrupt primary metadata.
63  */
64 
65 static struct xfs_btree_cur *
66 xfs_rmapbt_dup_cursor(
67 	struct xfs_btree_cur	*cur)
68 {
69 	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
70 			cur->bc_private.a.agbp, cur->bc_private.a.agno);
71 }
72 
73 STATIC void
74 xfs_rmapbt_set_root(
75 	struct xfs_btree_cur	*cur,
76 	union xfs_btree_ptr	*ptr,
77 	int			inc)
78 {
79 	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
80 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
81 	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
82 	int			btnum = cur->bc_btnum;
83 	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
84 
85 	ASSERT(ptr->s != 0);
86 
87 	agf->agf_roots[btnum] = ptr->s;
88 	be32_add_cpu(&agf->agf_levels[btnum], inc);
89 	pag->pagf_levels[btnum] += inc;
90 	xfs_perag_put(pag);
91 
92 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
93 }
94 
95 STATIC int
96 xfs_rmapbt_alloc_block(
97 	struct xfs_btree_cur	*cur,
98 	union xfs_btree_ptr	*start,
99 	union xfs_btree_ptr	*new,
100 	int			*stat)
101 {
102 	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
103 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
104 	int			error;
105 	xfs_agblock_t		bno;
106 
107 	/* Allocate the new block from the freelist. If we can't, give up.  */
108 	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
109 				       &bno, 1);
110 	if (error)
111 		return error;
112 
113 	trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
114 			bno, 1);
115 	if (bno == NULLAGBLOCK) {
116 		*stat = 0;
117 		return 0;
118 	}
119 
120 	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
121 			false);
122 
123 	xfs_trans_agbtree_delta(cur->bc_tp, 1);
124 	new->s = cpu_to_be32(bno);
125 	be32_add_cpu(&agf->agf_rmap_blocks, 1);
126 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
127 
128 	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
129 
130 	*stat = 1;
131 	return 0;
132 }
133 
134 STATIC int
135 xfs_rmapbt_free_block(
136 	struct xfs_btree_cur	*cur,
137 	struct xfs_buf		*bp)
138 {
139 	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
140 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
141 	xfs_agblock_t		bno;
142 	int			error;
143 
144 	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
145 	trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
146 			bno, 1);
147 	be32_add_cpu(&agf->agf_rmap_blocks, -1);
148 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
149 	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
150 	if (error)
151 		return error;
152 
153 	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
154 			      XFS_EXTENT_BUSY_SKIP_DISCARD);
155 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
156 
157 	xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
158 
159 	return 0;
160 }
161 
162 STATIC int
163 xfs_rmapbt_get_minrecs(
164 	struct xfs_btree_cur	*cur,
165 	int			level)
166 {
167 	return cur->bc_mp->m_rmap_mnr[level != 0];
168 }
169 
170 STATIC int
171 xfs_rmapbt_get_maxrecs(
172 	struct xfs_btree_cur	*cur,
173 	int			level)
174 {
175 	return cur->bc_mp->m_rmap_mxr[level != 0];
176 }
177 
178 STATIC void
179 xfs_rmapbt_init_key_from_rec(
180 	union xfs_btree_key	*key,
181 	union xfs_btree_rec	*rec)
182 {
183 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
184 	key->rmap.rm_owner = rec->rmap.rm_owner;
185 	key->rmap.rm_offset = rec->rmap.rm_offset;
186 }
187 
188 /*
189  * The high key for a reverse mapping record can be computed by shifting
190  * the startblock and offset to the highest value that would still map
191  * to that record.  In practice this means that we add blockcount-1 to
192  * the startblock for all records, and if the record is for a data/attr
193  * fork mapping, we add blockcount-1 to the offset too.
194  */
195 STATIC void
196 xfs_rmapbt_init_high_key_from_rec(
197 	union xfs_btree_key	*key,
198 	union xfs_btree_rec	*rec)
199 {
200 	uint64_t		off;
201 	int			adj;
202 
203 	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
204 
205 	key->rmap.rm_startblock = rec->rmap.rm_startblock;
206 	be32_add_cpu(&key->rmap.rm_startblock, adj);
207 	key->rmap.rm_owner = rec->rmap.rm_owner;
208 	key->rmap.rm_offset = rec->rmap.rm_offset;
209 	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
210 	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
211 		return;
212 	off = be64_to_cpu(key->rmap.rm_offset);
213 	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
214 	key->rmap.rm_offset = cpu_to_be64(off);
215 }
216 
217 STATIC void
218 xfs_rmapbt_init_rec_from_cur(
219 	struct xfs_btree_cur	*cur,
220 	union xfs_btree_rec	*rec)
221 {
222 	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
223 	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
224 	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
225 	rec->rmap.rm_offset = cpu_to_be64(
226 			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
227 }
228 
229 STATIC void
230 xfs_rmapbt_init_ptr_from_cur(
231 	struct xfs_btree_cur	*cur,
232 	union xfs_btree_ptr	*ptr)
233 {
234 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
235 
236 	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
237 
238 	ptr->s = agf->agf_roots[cur->bc_btnum];
239 }
240 
241 STATIC int64_t
242 xfs_rmapbt_key_diff(
243 	struct xfs_btree_cur	*cur,
244 	union xfs_btree_key	*key)
245 {
246 	struct xfs_rmap_irec	*rec = &cur->bc_rec.r;
247 	struct xfs_rmap_key	*kp = &key->rmap;
248 	__u64			x, y;
249 	int64_t			d;
250 
251 	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
252 	if (d)
253 		return d;
254 
255 	x = be64_to_cpu(kp->rm_owner);
256 	y = rec->rm_owner;
257 	if (x > y)
258 		return 1;
259 	else if (y > x)
260 		return -1;
261 
262 	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
263 	y = rec->rm_offset;
264 	if (x > y)
265 		return 1;
266 	else if (y > x)
267 		return -1;
268 	return 0;
269 }
270 
271 STATIC int64_t
272 xfs_rmapbt_diff_two_keys(
273 	struct xfs_btree_cur	*cur,
274 	union xfs_btree_key	*k1,
275 	union xfs_btree_key	*k2)
276 {
277 	struct xfs_rmap_key	*kp1 = &k1->rmap;
278 	struct xfs_rmap_key	*kp2 = &k2->rmap;
279 	int64_t			d;
280 	__u64			x, y;
281 
282 	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
283 		       be32_to_cpu(kp2->rm_startblock);
284 	if (d)
285 		return d;
286 
287 	x = be64_to_cpu(kp1->rm_owner);
288 	y = be64_to_cpu(kp2->rm_owner);
289 	if (x > y)
290 		return 1;
291 	else if (y > x)
292 		return -1;
293 
294 	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
295 	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
296 	if (x > y)
297 		return 1;
298 	else if (y > x)
299 		return -1;
300 	return 0;
301 }
302 
303 static xfs_failaddr_t
304 xfs_rmapbt_verify(
305 	struct xfs_buf		*bp)
306 {
307 	struct xfs_mount	*mp = bp->b_target->bt_mount;
308 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
309 	struct xfs_perag	*pag = bp->b_pag;
310 	xfs_failaddr_t		fa;
311 	unsigned int		level;
312 
313 	/*
314 	 * magic number and level verification
315 	 *
316 	 * During growfs operations, we can't verify the exact level or owner as
317 	 * the perag is not fully initialised and hence not attached to the
318 	 * buffer.  In this case, check against the maximum tree depth.
319 	 *
320 	 * Similarly, during log recovery we will have a perag structure
321 	 * attached, but the agf information will not yet have been initialised
322 	 * from the on disk AGF. Again, we can only check against maximum limits
323 	 * in this case.
324 	 */
325 	if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
326 		return __this_address;
327 
328 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
329 		return __this_address;
330 	fa = xfs_btree_sblock_v5hdr_verify(bp);
331 	if (fa)
332 		return fa;
333 
334 	level = be16_to_cpu(block->bb_level);
335 	if (pag && pag->pagf_init) {
336 		if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
337 			return __this_address;
338 	} else if (level >= mp->m_rmap_maxlevels)
339 		return __this_address;
340 
341 	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
342 }
343 
344 static void
345 xfs_rmapbt_read_verify(
346 	struct xfs_buf	*bp)
347 {
348 	xfs_failaddr_t	fa;
349 
350 	if (!xfs_btree_sblock_verify_crc(bp))
351 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
352 	else {
353 		fa = xfs_rmapbt_verify(bp);
354 		if (fa)
355 			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
356 	}
357 
358 	if (bp->b_error)
359 		trace_xfs_btree_corrupt(bp, _RET_IP_);
360 }
361 
362 static void
363 xfs_rmapbt_write_verify(
364 	struct xfs_buf	*bp)
365 {
366 	xfs_failaddr_t	fa;
367 
368 	fa = xfs_rmapbt_verify(bp);
369 	if (fa) {
370 		trace_xfs_btree_corrupt(bp, _RET_IP_);
371 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
372 		return;
373 	}
374 	xfs_btree_sblock_calc_crc(bp);
375 
376 }
377 
378 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
379 	.name			= "xfs_rmapbt",
380 	.verify_read		= xfs_rmapbt_read_verify,
381 	.verify_write		= xfs_rmapbt_write_verify,
382 	.verify_struct		= xfs_rmapbt_verify,
383 };
384 
385 STATIC int
386 xfs_rmapbt_keys_inorder(
387 	struct xfs_btree_cur	*cur,
388 	union xfs_btree_key	*k1,
389 	union xfs_btree_key	*k2)
390 {
391 	uint32_t		x;
392 	uint32_t		y;
393 	uint64_t		a;
394 	uint64_t		b;
395 
396 	x = be32_to_cpu(k1->rmap.rm_startblock);
397 	y = be32_to_cpu(k2->rmap.rm_startblock);
398 	if (x < y)
399 		return 1;
400 	else if (x > y)
401 		return 0;
402 	a = be64_to_cpu(k1->rmap.rm_owner);
403 	b = be64_to_cpu(k2->rmap.rm_owner);
404 	if (a < b)
405 		return 1;
406 	else if (a > b)
407 		return 0;
408 	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
409 	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
410 	if (a <= b)
411 		return 1;
412 	return 0;
413 }
414 
415 STATIC int
416 xfs_rmapbt_recs_inorder(
417 	struct xfs_btree_cur	*cur,
418 	union xfs_btree_rec	*r1,
419 	union xfs_btree_rec	*r2)
420 {
421 	uint32_t		x;
422 	uint32_t		y;
423 	uint64_t		a;
424 	uint64_t		b;
425 
426 	x = be32_to_cpu(r1->rmap.rm_startblock);
427 	y = be32_to_cpu(r2->rmap.rm_startblock);
428 	if (x < y)
429 		return 1;
430 	else if (x > y)
431 		return 0;
432 	a = be64_to_cpu(r1->rmap.rm_owner);
433 	b = be64_to_cpu(r2->rmap.rm_owner);
434 	if (a < b)
435 		return 1;
436 	else if (a > b)
437 		return 0;
438 	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
439 	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
440 	if (a <= b)
441 		return 1;
442 	return 0;
443 }
444 
445 static const struct xfs_btree_ops xfs_rmapbt_ops = {
446 	.rec_len		= sizeof(struct xfs_rmap_rec),
447 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
448 
449 	.dup_cursor		= xfs_rmapbt_dup_cursor,
450 	.set_root		= xfs_rmapbt_set_root,
451 	.alloc_block		= xfs_rmapbt_alloc_block,
452 	.free_block		= xfs_rmapbt_free_block,
453 	.get_minrecs		= xfs_rmapbt_get_minrecs,
454 	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
455 	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
456 	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
457 	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
458 	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
459 	.key_diff		= xfs_rmapbt_key_diff,
460 	.buf_ops		= &xfs_rmapbt_buf_ops,
461 	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
462 	.keys_inorder		= xfs_rmapbt_keys_inorder,
463 	.recs_inorder		= xfs_rmapbt_recs_inorder,
464 };
465 
466 /*
467  * Allocate a new allocation btree cursor.
468  */
469 struct xfs_btree_cur *
470 xfs_rmapbt_init_cursor(
471 	struct xfs_mount	*mp,
472 	struct xfs_trans	*tp,
473 	struct xfs_buf		*agbp,
474 	xfs_agnumber_t		agno)
475 {
476 	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
477 	struct xfs_btree_cur	*cur;
478 
479 	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
480 	cur->bc_tp = tp;
481 	cur->bc_mp = mp;
482 	/* Overlapping btree; 2 keys per pointer. */
483 	cur->bc_btnum = XFS_BTNUM_RMAP;
484 	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
485 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
486 	cur->bc_ops = &xfs_rmapbt_ops;
487 	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
488 	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
489 
490 	cur->bc_private.a.agbp = agbp;
491 	cur->bc_private.a.agno = agno;
492 
493 	return cur;
494 }
495 
496 /*
497  * Calculate number of records in an rmap btree block.
498  */
499 int
500 xfs_rmapbt_maxrecs(
501 	int			blocklen,
502 	int			leaf)
503 {
504 	blocklen -= XFS_RMAP_BLOCK_LEN;
505 
506 	if (leaf)
507 		return blocklen / sizeof(struct xfs_rmap_rec);
508 	return blocklen /
509 		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
510 }
511 
512 /* Compute the maximum height of an rmap btree. */
513 void
514 xfs_rmapbt_compute_maxlevels(
515 	struct xfs_mount		*mp)
516 {
517 	/*
518 	 * On a non-reflink filesystem, the maximum number of rmap
519 	 * records is the number of blocks in the AG, hence the max
520 	 * rmapbt height is log_$maxrecs($agblocks).  However, with
521 	 * reflink each AG block can have up to 2^32 (per the refcount
522 	 * record format) owners, which means that theoretically we
523 	 * could face up to 2^64 rmap records.
524 	 *
525 	 * That effectively means that the max rmapbt height must be
526 	 * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG
527 	 * blocks to feed the rmapbt long before the rmapbt reaches
528 	 * maximum height.  The reflink code uses ag_resv_critical to
529 	 * disallow reflinking when less than 10% of the per-AG metadata
530 	 * block reservation since the fallback is a regular file copy.
531 	 */
532 	if (xfs_sb_version_hasreflink(&mp->m_sb))
533 		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
534 	else
535 		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
536 				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
537 }
538 
539 /* Calculate the refcount btree size for some records. */
540 xfs_extlen_t
541 xfs_rmapbt_calc_size(
542 	struct xfs_mount	*mp,
543 	unsigned long long	len)
544 {
545 	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
546 }
547 
548 /*
549  * Calculate the maximum refcount btree size.
550  */
551 xfs_extlen_t
552 xfs_rmapbt_max_size(
553 	struct xfs_mount	*mp,
554 	xfs_agblock_t		agblocks)
555 {
556 	/* Bail out if we're uninitialized, which can happen in mkfs. */
557 	if (mp->m_rmap_mxr[0] == 0)
558 		return 0;
559 
560 	return xfs_rmapbt_calc_size(mp, agblocks);
561 }
562 
563 /*
564  * Figure out how many blocks to reserve and how many are used by this btree.
565  */
566 int
567 xfs_rmapbt_calc_reserves(
568 	struct xfs_mount	*mp,
569 	xfs_agnumber_t		agno,
570 	xfs_extlen_t		*ask,
571 	xfs_extlen_t		*used)
572 {
573 	struct xfs_buf		*agbp;
574 	struct xfs_agf		*agf;
575 	xfs_agblock_t		agblocks;
576 	xfs_extlen_t		tree_len;
577 	int			error;
578 
579 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
580 		return 0;
581 
582 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
583 	if (error)
584 		return error;
585 
586 	agf = XFS_BUF_TO_AGF(agbp);
587 	agblocks = be32_to_cpu(agf->agf_length);
588 	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
589 	xfs_buf_relse(agbp);
590 
591 	/* Reserve 1% of the AG or enough for 1 block per record. */
592 	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
593 	*used += tree_len;
594 
595 	return error;
596 }
597