1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2014 Red Hat, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_sb.h" 14 #include "xfs_mount.h" 15 #include "xfs_defer.h" 16 #include "xfs_inode.h" 17 #include "xfs_trans.h" 18 #include "xfs_alloc.h" 19 #include "xfs_btree.h" 20 #include "xfs_rmap.h" 21 #include "xfs_rmap_btree.h" 22 #include "xfs_trace.h" 23 #include "xfs_cksum.h" 24 #include "xfs_error.h" 25 #include "xfs_extent_busy.h" 26 #include "xfs_ag_resv.h" 27 28 /* 29 * Reverse map btree. 30 * 31 * This is a per-ag tree used to track the owner(s) of a given extent. With 32 * reflink it is possible for there to be multiple owners, which is a departure 33 * from classic XFS. Owner records for data extents are inserted when the 34 * extent is mapped and removed when an extent is unmapped. Owner records for 35 * all other block types (i.e. metadata) are inserted when an extent is 36 * allocated and removed when an extent is freed. There can only be one owner 37 * of a metadata extent, usually an inode or some other metadata structure like 38 * an AG btree. 39 * 40 * The rmap btree is part of the free space management, so blocks for the tree 41 * are sourced from the agfl. Hence we need transaction reservation support for 42 * this tree so that the freelist is always large enough. This also impacts on 43 * the minimum space we need to leave free in the AG. 44 * 45 * The tree is ordered by [ag block, owner, offset]. This is a large key size, 46 * but it is the only way to enforce unique keys when a block can be owned by 47 * multiple files at any offset. There's no need to order/search by extent 48 * size for online updating/management of the tree. It is intended that most 49 * reverse lookups will be to find the owner(s) of a particular block, or to 50 * try to recover tree and file data from corrupt primary metadata. 51 */ 52 53 static struct xfs_btree_cur * 54 xfs_rmapbt_dup_cursor( 55 struct xfs_btree_cur *cur) 56 { 57 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, 58 cur->bc_private.a.agbp, cur->bc_private.a.agno); 59 } 60 61 STATIC void 62 xfs_rmapbt_set_root( 63 struct xfs_btree_cur *cur, 64 union xfs_btree_ptr *ptr, 65 int inc) 66 { 67 struct xfs_buf *agbp = cur->bc_private.a.agbp; 68 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 69 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 70 int btnum = cur->bc_btnum; 71 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); 72 73 ASSERT(ptr->s != 0); 74 75 agf->agf_roots[btnum] = ptr->s; 76 be32_add_cpu(&agf->agf_levels[btnum], inc); 77 pag->pagf_levels[btnum] += inc; 78 xfs_perag_put(pag); 79 80 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 81 } 82 83 STATIC int 84 xfs_rmapbt_alloc_block( 85 struct xfs_btree_cur *cur, 86 union xfs_btree_ptr *start, 87 union xfs_btree_ptr *new, 88 int *stat) 89 { 90 struct xfs_buf *agbp = cur->bc_private.a.agbp; 91 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 92 int error; 93 xfs_agblock_t bno; 94 95 /* Allocate the new block from the freelist. If we can't, give up. */ 96 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, 97 &bno, 1); 98 if (error) 99 return error; 100 101 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, 102 bno, 1); 103 if (bno == NULLAGBLOCK) { 104 *stat = 0; 105 return 0; 106 } 107 108 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, 109 false); 110 111 xfs_trans_agbtree_delta(cur->bc_tp, 1); 112 new->s = cpu_to_be32(bno); 113 be32_add_cpu(&agf->agf_rmap_blocks, 1); 114 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 115 116 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); 117 118 *stat = 1; 119 return 0; 120 } 121 122 STATIC int 123 xfs_rmapbt_free_block( 124 struct xfs_btree_cur *cur, 125 struct xfs_buf *bp) 126 { 127 struct xfs_buf *agbp = cur->bc_private.a.agbp; 128 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 129 xfs_agblock_t bno; 130 int error; 131 132 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); 133 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, 134 bno, 1); 135 be32_add_cpu(&agf->agf_rmap_blocks, -1); 136 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 137 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 138 if (error) 139 return error; 140 141 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 142 XFS_EXTENT_BUSY_SKIP_DISCARD); 143 xfs_trans_agbtree_delta(cur->bc_tp, -1); 144 145 xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); 146 147 return 0; 148 } 149 150 STATIC int 151 xfs_rmapbt_get_minrecs( 152 struct xfs_btree_cur *cur, 153 int level) 154 { 155 return cur->bc_mp->m_rmap_mnr[level != 0]; 156 } 157 158 STATIC int 159 xfs_rmapbt_get_maxrecs( 160 struct xfs_btree_cur *cur, 161 int level) 162 { 163 return cur->bc_mp->m_rmap_mxr[level != 0]; 164 } 165 166 STATIC void 167 xfs_rmapbt_init_key_from_rec( 168 union xfs_btree_key *key, 169 union xfs_btree_rec *rec) 170 { 171 key->rmap.rm_startblock = rec->rmap.rm_startblock; 172 key->rmap.rm_owner = rec->rmap.rm_owner; 173 key->rmap.rm_offset = rec->rmap.rm_offset; 174 } 175 176 /* 177 * The high key for a reverse mapping record can be computed by shifting 178 * the startblock and offset to the highest value that would still map 179 * to that record. In practice this means that we add blockcount-1 to 180 * the startblock for all records, and if the record is for a data/attr 181 * fork mapping, we add blockcount-1 to the offset too. 182 */ 183 STATIC void 184 xfs_rmapbt_init_high_key_from_rec( 185 union xfs_btree_key *key, 186 union xfs_btree_rec *rec) 187 { 188 uint64_t off; 189 int adj; 190 191 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 192 193 key->rmap.rm_startblock = rec->rmap.rm_startblock; 194 be32_add_cpu(&key->rmap.rm_startblock, adj); 195 key->rmap.rm_owner = rec->rmap.rm_owner; 196 key->rmap.rm_offset = rec->rmap.rm_offset; 197 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 198 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 199 return; 200 off = be64_to_cpu(key->rmap.rm_offset); 201 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 202 key->rmap.rm_offset = cpu_to_be64(off); 203 } 204 205 STATIC void 206 xfs_rmapbt_init_rec_from_cur( 207 struct xfs_btree_cur *cur, 208 union xfs_btree_rec *rec) 209 { 210 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 211 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 212 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 213 rec->rmap.rm_offset = cpu_to_be64( 214 xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 215 } 216 217 STATIC void 218 xfs_rmapbt_init_ptr_from_cur( 219 struct xfs_btree_cur *cur, 220 union xfs_btree_ptr *ptr) 221 { 222 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 223 224 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); 225 226 ptr->s = agf->agf_roots[cur->bc_btnum]; 227 } 228 229 STATIC int64_t 230 xfs_rmapbt_key_diff( 231 struct xfs_btree_cur *cur, 232 union xfs_btree_key *key) 233 { 234 struct xfs_rmap_irec *rec = &cur->bc_rec.r; 235 struct xfs_rmap_key *kp = &key->rmap; 236 __u64 x, y; 237 int64_t d; 238 239 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 240 if (d) 241 return d; 242 243 x = be64_to_cpu(kp->rm_owner); 244 y = rec->rm_owner; 245 if (x > y) 246 return 1; 247 else if (y > x) 248 return -1; 249 250 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); 251 y = rec->rm_offset; 252 if (x > y) 253 return 1; 254 else if (y > x) 255 return -1; 256 return 0; 257 } 258 259 STATIC int64_t 260 xfs_rmapbt_diff_two_keys( 261 struct xfs_btree_cur *cur, 262 union xfs_btree_key *k1, 263 union xfs_btree_key *k2) 264 { 265 struct xfs_rmap_key *kp1 = &k1->rmap; 266 struct xfs_rmap_key *kp2 = &k2->rmap; 267 int64_t d; 268 __u64 x, y; 269 270 d = (int64_t)be32_to_cpu(kp1->rm_startblock) - 271 be32_to_cpu(kp2->rm_startblock); 272 if (d) 273 return d; 274 275 x = be64_to_cpu(kp1->rm_owner); 276 y = be64_to_cpu(kp2->rm_owner); 277 if (x > y) 278 return 1; 279 else if (y > x) 280 return -1; 281 282 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); 283 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); 284 if (x > y) 285 return 1; 286 else if (y > x) 287 return -1; 288 return 0; 289 } 290 291 static xfs_failaddr_t 292 xfs_rmapbt_verify( 293 struct xfs_buf *bp) 294 { 295 struct xfs_mount *mp = bp->b_target->bt_mount; 296 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 297 struct xfs_perag *pag = bp->b_pag; 298 xfs_failaddr_t fa; 299 unsigned int level; 300 301 /* 302 * magic number and level verification 303 * 304 * During growfs operations, we can't verify the exact level or owner as 305 * the perag is not fully initialised and hence not attached to the 306 * buffer. In this case, check against the maximum tree depth. 307 * 308 * Similarly, during log recovery we will have a perag structure 309 * attached, but the agf information will not yet have been initialised 310 * from the on disk AGF. Again, we can only check against maximum limits 311 * in this case. 312 */ 313 if (!xfs_verify_magic(bp, block->bb_magic)) 314 return __this_address; 315 316 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 317 return __this_address; 318 fa = xfs_btree_sblock_v5hdr_verify(bp); 319 if (fa) 320 return fa; 321 322 level = be16_to_cpu(block->bb_level); 323 if (pag && pag->pagf_init) { 324 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) 325 return __this_address; 326 } else if (level >= mp->m_rmap_maxlevels) 327 return __this_address; 328 329 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); 330 } 331 332 static void 333 xfs_rmapbt_read_verify( 334 struct xfs_buf *bp) 335 { 336 xfs_failaddr_t fa; 337 338 if (!xfs_btree_sblock_verify_crc(bp)) 339 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 340 else { 341 fa = xfs_rmapbt_verify(bp); 342 if (fa) 343 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 344 } 345 346 if (bp->b_error) 347 trace_xfs_btree_corrupt(bp, _RET_IP_); 348 } 349 350 static void 351 xfs_rmapbt_write_verify( 352 struct xfs_buf *bp) 353 { 354 xfs_failaddr_t fa; 355 356 fa = xfs_rmapbt_verify(bp); 357 if (fa) { 358 trace_xfs_btree_corrupt(bp, _RET_IP_); 359 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 360 return; 361 } 362 xfs_btree_sblock_calc_crc(bp); 363 364 } 365 366 const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 367 .name = "xfs_rmapbt", 368 .magic = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) }, 369 .verify_read = xfs_rmapbt_read_verify, 370 .verify_write = xfs_rmapbt_write_verify, 371 .verify_struct = xfs_rmapbt_verify, 372 }; 373 374 STATIC int 375 xfs_rmapbt_keys_inorder( 376 struct xfs_btree_cur *cur, 377 union xfs_btree_key *k1, 378 union xfs_btree_key *k2) 379 { 380 uint32_t x; 381 uint32_t y; 382 uint64_t a; 383 uint64_t b; 384 385 x = be32_to_cpu(k1->rmap.rm_startblock); 386 y = be32_to_cpu(k2->rmap.rm_startblock); 387 if (x < y) 388 return 1; 389 else if (x > y) 390 return 0; 391 a = be64_to_cpu(k1->rmap.rm_owner); 392 b = be64_to_cpu(k2->rmap.rm_owner); 393 if (a < b) 394 return 1; 395 else if (a > b) 396 return 0; 397 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); 398 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); 399 if (a <= b) 400 return 1; 401 return 0; 402 } 403 404 STATIC int 405 xfs_rmapbt_recs_inorder( 406 struct xfs_btree_cur *cur, 407 union xfs_btree_rec *r1, 408 union xfs_btree_rec *r2) 409 { 410 uint32_t x; 411 uint32_t y; 412 uint64_t a; 413 uint64_t b; 414 415 x = be32_to_cpu(r1->rmap.rm_startblock); 416 y = be32_to_cpu(r2->rmap.rm_startblock); 417 if (x < y) 418 return 1; 419 else if (x > y) 420 return 0; 421 a = be64_to_cpu(r1->rmap.rm_owner); 422 b = be64_to_cpu(r2->rmap.rm_owner); 423 if (a < b) 424 return 1; 425 else if (a > b) 426 return 0; 427 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); 428 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); 429 if (a <= b) 430 return 1; 431 return 0; 432 } 433 434 static const struct xfs_btree_ops xfs_rmapbt_ops = { 435 .rec_len = sizeof(struct xfs_rmap_rec), 436 .key_len = 2 * sizeof(struct xfs_rmap_key), 437 438 .dup_cursor = xfs_rmapbt_dup_cursor, 439 .set_root = xfs_rmapbt_set_root, 440 .alloc_block = xfs_rmapbt_alloc_block, 441 .free_block = xfs_rmapbt_free_block, 442 .get_minrecs = xfs_rmapbt_get_minrecs, 443 .get_maxrecs = xfs_rmapbt_get_maxrecs, 444 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 445 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 446 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 447 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 448 .key_diff = xfs_rmapbt_key_diff, 449 .buf_ops = &xfs_rmapbt_buf_ops, 450 .diff_two_keys = xfs_rmapbt_diff_two_keys, 451 .keys_inorder = xfs_rmapbt_keys_inorder, 452 .recs_inorder = xfs_rmapbt_recs_inorder, 453 }; 454 455 /* 456 * Allocate a new allocation btree cursor. 457 */ 458 struct xfs_btree_cur * 459 xfs_rmapbt_init_cursor( 460 struct xfs_mount *mp, 461 struct xfs_trans *tp, 462 struct xfs_buf *agbp, 463 xfs_agnumber_t agno) 464 { 465 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 466 struct xfs_btree_cur *cur; 467 468 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 469 cur->bc_tp = tp; 470 cur->bc_mp = mp; 471 /* Overlapping btree; 2 keys per pointer. */ 472 cur->bc_btnum = XFS_BTNUM_RMAP; 473 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; 474 cur->bc_blocklog = mp->m_sb.sb_blocklog; 475 cur->bc_ops = &xfs_rmapbt_ops; 476 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); 477 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); 478 479 cur->bc_private.a.agbp = agbp; 480 cur->bc_private.a.agno = agno; 481 482 return cur; 483 } 484 485 /* 486 * Calculate number of records in an rmap btree block. 487 */ 488 int 489 xfs_rmapbt_maxrecs( 490 int blocklen, 491 int leaf) 492 { 493 blocklen -= XFS_RMAP_BLOCK_LEN; 494 495 if (leaf) 496 return blocklen / sizeof(struct xfs_rmap_rec); 497 return blocklen / 498 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); 499 } 500 501 /* Compute the maximum height of an rmap btree. */ 502 void 503 xfs_rmapbt_compute_maxlevels( 504 struct xfs_mount *mp) 505 { 506 /* 507 * On a non-reflink filesystem, the maximum number of rmap 508 * records is the number of blocks in the AG, hence the max 509 * rmapbt height is log_$maxrecs($agblocks). However, with 510 * reflink each AG block can have up to 2^32 (per the refcount 511 * record format) owners, which means that theoretically we 512 * could face up to 2^64 rmap records. 513 * 514 * That effectively means that the max rmapbt height must be 515 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG 516 * blocks to feed the rmapbt long before the rmapbt reaches 517 * maximum height. The reflink code uses ag_resv_critical to 518 * disallow reflinking when less than 10% of the per-AG metadata 519 * block reservation since the fallback is a regular file copy. 520 */ 521 if (xfs_sb_version_hasreflink(&mp->m_sb)) 522 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; 523 else 524 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 525 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 526 } 527 528 /* Calculate the refcount btree size for some records. */ 529 xfs_extlen_t 530 xfs_rmapbt_calc_size( 531 struct xfs_mount *mp, 532 unsigned long long len) 533 { 534 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 535 } 536 537 /* 538 * Calculate the maximum refcount btree size. 539 */ 540 xfs_extlen_t 541 xfs_rmapbt_max_size( 542 struct xfs_mount *mp, 543 xfs_agblock_t agblocks) 544 { 545 /* Bail out if we're uninitialized, which can happen in mkfs. */ 546 if (mp->m_rmap_mxr[0] == 0) 547 return 0; 548 549 return xfs_rmapbt_calc_size(mp, agblocks); 550 } 551 552 /* 553 * Figure out how many blocks to reserve and how many are used by this btree. 554 */ 555 int 556 xfs_rmapbt_calc_reserves( 557 struct xfs_mount *mp, 558 struct xfs_trans *tp, 559 xfs_agnumber_t agno, 560 xfs_extlen_t *ask, 561 xfs_extlen_t *used) 562 { 563 struct xfs_buf *agbp; 564 struct xfs_agf *agf; 565 xfs_agblock_t agblocks; 566 xfs_extlen_t tree_len; 567 int error; 568 569 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 570 return 0; 571 572 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); 573 if (error) 574 return error; 575 576 agf = XFS_BUF_TO_AGF(agbp); 577 agblocks = be32_to_cpu(agf->agf_length); 578 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 579 xfs_trans_brelse(tp, agbp); 580 581 /* Reserve 1% of the AG or enough for 1 block per record. */ 582 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 583 *used += tree_len; 584 585 return error; 586 } 587