1 /* 2 * Copyright (c) 2014 Red Hat, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_shared.h" 21 #include "xfs_format.h" 22 #include "xfs_log_format.h" 23 #include "xfs_trans_resv.h" 24 #include "xfs_bit.h" 25 #include "xfs_sb.h" 26 #include "xfs_mount.h" 27 #include "xfs_defer.h" 28 #include "xfs_inode.h" 29 #include "xfs_trans.h" 30 #include "xfs_alloc.h" 31 #include "xfs_btree.h" 32 #include "xfs_rmap.h" 33 #include "xfs_rmap_btree.h" 34 #include "xfs_trace.h" 35 #include "xfs_cksum.h" 36 #include "xfs_error.h" 37 #include "xfs_extent_busy.h" 38 #include "xfs_ag_resv.h" 39 40 /* 41 * Reverse map btree. 42 * 43 * This is a per-ag tree used to track the owner(s) of a given extent. With 44 * reflink it is possible for there to be multiple owners, which is a departure 45 * from classic XFS. Owner records for data extents are inserted when the 46 * extent is mapped and removed when an extent is unmapped. Owner records for 47 * all other block types (i.e. metadata) are inserted when an extent is 48 * allocated and removed when an extent is freed. There can only be one owner 49 * of a metadata extent, usually an inode or some other metadata structure like 50 * an AG btree. 51 * 52 * The rmap btree is part of the free space management, so blocks for the tree 53 * are sourced from the agfl. Hence we need transaction reservation support for 54 * this tree so that the freelist is always large enough. This also impacts on 55 * the minimum space we need to leave free in the AG. 56 * 57 * The tree is ordered by [ag block, owner, offset]. This is a large key size, 58 * but it is the only way to enforce unique keys when a block can be owned by 59 * multiple files at any offset. There's no need to order/search by extent 60 * size for online updating/management of the tree. It is intended that most 61 * reverse lookups will be to find the owner(s) of a particular block, or to 62 * try to recover tree and file data from corrupt primary metadata. 63 */ 64 65 static struct xfs_btree_cur * 66 xfs_rmapbt_dup_cursor( 67 struct xfs_btree_cur *cur) 68 { 69 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp, 70 cur->bc_private.a.agbp, cur->bc_private.a.agno); 71 } 72 73 STATIC void 74 xfs_rmapbt_set_root( 75 struct xfs_btree_cur *cur, 76 union xfs_btree_ptr *ptr, 77 int inc) 78 { 79 struct xfs_buf *agbp = cur->bc_private.a.agbp; 80 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 81 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); 82 int btnum = cur->bc_btnum; 83 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); 84 85 ASSERT(ptr->s != 0); 86 87 agf->agf_roots[btnum] = ptr->s; 88 be32_add_cpu(&agf->agf_levels[btnum], inc); 89 pag->pagf_levels[btnum] += inc; 90 xfs_perag_put(pag); 91 92 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); 93 } 94 95 STATIC int 96 xfs_rmapbt_alloc_block( 97 struct xfs_btree_cur *cur, 98 union xfs_btree_ptr *start, 99 union xfs_btree_ptr *new, 100 int *stat) 101 { 102 struct xfs_buf *agbp = cur->bc_private.a.agbp; 103 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 104 int error; 105 xfs_agblock_t bno; 106 107 /* Allocate the new block from the freelist. If we can't, give up. */ 108 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, 109 &bno, 1); 110 if (error) 111 return error; 112 113 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, 114 bno, 1); 115 if (bno == NULLAGBLOCK) { 116 *stat = 0; 117 return 0; 118 } 119 120 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, 121 false); 122 123 xfs_trans_agbtree_delta(cur->bc_tp, 1); 124 new->s = cpu_to_be32(bno); 125 be32_add_cpu(&agf->agf_rmap_blocks, 1); 126 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 127 128 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); 129 130 *stat = 1; 131 return 0; 132 } 133 134 STATIC int 135 xfs_rmapbt_free_block( 136 struct xfs_btree_cur *cur, 137 struct xfs_buf *bp) 138 { 139 struct xfs_buf *agbp = cur->bc_private.a.agbp; 140 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 141 xfs_agblock_t bno; 142 int error; 143 144 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); 145 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno, 146 bno, 1); 147 be32_add_cpu(&agf->agf_rmap_blocks, -1); 148 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); 149 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); 150 if (error) 151 return error; 152 153 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, 154 XFS_EXTENT_BUSY_SKIP_DISCARD); 155 xfs_trans_agbtree_delta(cur->bc_tp, -1); 156 157 xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); 158 159 return 0; 160 } 161 162 STATIC int 163 xfs_rmapbt_get_minrecs( 164 struct xfs_btree_cur *cur, 165 int level) 166 { 167 return cur->bc_mp->m_rmap_mnr[level != 0]; 168 } 169 170 STATIC int 171 xfs_rmapbt_get_maxrecs( 172 struct xfs_btree_cur *cur, 173 int level) 174 { 175 return cur->bc_mp->m_rmap_mxr[level != 0]; 176 } 177 178 STATIC void 179 xfs_rmapbt_init_key_from_rec( 180 union xfs_btree_key *key, 181 union xfs_btree_rec *rec) 182 { 183 key->rmap.rm_startblock = rec->rmap.rm_startblock; 184 key->rmap.rm_owner = rec->rmap.rm_owner; 185 key->rmap.rm_offset = rec->rmap.rm_offset; 186 } 187 188 /* 189 * The high key for a reverse mapping record can be computed by shifting 190 * the startblock and offset to the highest value that would still map 191 * to that record. In practice this means that we add blockcount-1 to 192 * the startblock for all records, and if the record is for a data/attr 193 * fork mapping, we add blockcount-1 to the offset too. 194 */ 195 STATIC void 196 xfs_rmapbt_init_high_key_from_rec( 197 union xfs_btree_key *key, 198 union xfs_btree_rec *rec) 199 { 200 uint64_t off; 201 int adj; 202 203 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; 204 205 key->rmap.rm_startblock = rec->rmap.rm_startblock; 206 be32_add_cpu(&key->rmap.rm_startblock, adj); 207 key->rmap.rm_owner = rec->rmap.rm_owner; 208 key->rmap.rm_offset = rec->rmap.rm_offset; 209 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) || 210 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset))) 211 return; 212 off = be64_to_cpu(key->rmap.rm_offset); 213 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK); 214 key->rmap.rm_offset = cpu_to_be64(off); 215 } 216 217 STATIC void 218 xfs_rmapbt_init_rec_from_cur( 219 struct xfs_btree_cur *cur, 220 union xfs_btree_rec *rec) 221 { 222 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock); 223 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount); 224 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner); 225 rec->rmap.rm_offset = cpu_to_be64( 226 xfs_rmap_irec_offset_pack(&cur->bc_rec.r)); 227 } 228 229 STATIC void 230 xfs_rmapbt_init_ptr_from_cur( 231 struct xfs_btree_cur *cur, 232 union xfs_btree_ptr *ptr) 233 { 234 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); 235 236 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); 237 ASSERT(agf->agf_roots[cur->bc_btnum] != 0); 238 239 ptr->s = agf->agf_roots[cur->bc_btnum]; 240 } 241 242 STATIC int64_t 243 xfs_rmapbt_key_diff( 244 struct xfs_btree_cur *cur, 245 union xfs_btree_key *key) 246 { 247 struct xfs_rmap_irec *rec = &cur->bc_rec.r; 248 struct xfs_rmap_key *kp = &key->rmap; 249 __u64 x, y; 250 int64_t d; 251 252 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; 253 if (d) 254 return d; 255 256 x = be64_to_cpu(kp->rm_owner); 257 y = rec->rm_owner; 258 if (x > y) 259 return 1; 260 else if (y > x) 261 return -1; 262 263 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset)); 264 y = rec->rm_offset; 265 if (x > y) 266 return 1; 267 else if (y > x) 268 return -1; 269 return 0; 270 } 271 272 STATIC int64_t 273 xfs_rmapbt_diff_two_keys( 274 struct xfs_btree_cur *cur, 275 union xfs_btree_key *k1, 276 union xfs_btree_key *k2) 277 { 278 struct xfs_rmap_key *kp1 = &k1->rmap; 279 struct xfs_rmap_key *kp2 = &k2->rmap; 280 int64_t d; 281 __u64 x, y; 282 283 d = (int64_t)be32_to_cpu(kp1->rm_startblock) - 284 be32_to_cpu(kp2->rm_startblock); 285 if (d) 286 return d; 287 288 x = be64_to_cpu(kp1->rm_owner); 289 y = be64_to_cpu(kp2->rm_owner); 290 if (x > y) 291 return 1; 292 else if (y > x) 293 return -1; 294 295 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset)); 296 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset)); 297 if (x > y) 298 return 1; 299 else if (y > x) 300 return -1; 301 return 0; 302 } 303 304 static xfs_failaddr_t 305 xfs_rmapbt_verify( 306 struct xfs_buf *bp) 307 { 308 struct xfs_mount *mp = bp->b_target->bt_mount; 309 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); 310 struct xfs_perag *pag = bp->b_pag; 311 xfs_failaddr_t fa; 312 unsigned int level; 313 314 /* 315 * magic number and level verification 316 * 317 * During growfs operations, we can't verify the exact level or owner as 318 * the perag is not fully initialised and hence not attached to the 319 * buffer. In this case, check against the maximum tree depth. 320 * 321 * Similarly, during log recovery we will have a perag structure 322 * attached, but the agf information will not yet have been initialised 323 * from the on disk AGF. Again, we can only check against maximum limits 324 * in this case. 325 */ 326 if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC)) 327 return __this_address; 328 329 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 330 return __this_address; 331 fa = xfs_btree_sblock_v5hdr_verify(bp); 332 if (fa) 333 return fa; 334 335 level = be16_to_cpu(block->bb_level); 336 if (pag && pag->pagf_init) { 337 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi]) 338 return __this_address; 339 } else if (level >= mp->m_rmap_maxlevels) 340 return __this_address; 341 342 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]); 343 } 344 345 static void 346 xfs_rmapbt_read_verify( 347 struct xfs_buf *bp) 348 { 349 xfs_failaddr_t fa; 350 351 if (!xfs_btree_sblock_verify_crc(bp)) 352 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 353 else { 354 fa = xfs_rmapbt_verify(bp); 355 if (fa) 356 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 357 } 358 359 if (bp->b_error) 360 trace_xfs_btree_corrupt(bp, _RET_IP_); 361 } 362 363 static void 364 xfs_rmapbt_write_verify( 365 struct xfs_buf *bp) 366 { 367 xfs_failaddr_t fa; 368 369 fa = xfs_rmapbt_verify(bp); 370 if (fa) { 371 trace_xfs_btree_corrupt(bp, _RET_IP_); 372 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 373 return; 374 } 375 xfs_btree_sblock_calc_crc(bp); 376 377 } 378 379 const struct xfs_buf_ops xfs_rmapbt_buf_ops = { 380 .name = "xfs_rmapbt", 381 .verify_read = xfs_rmapbt_read_verify, 382 .verify_write = xfs_rmapbt_write_verify, 383 .verify_struct = xfs_rmapbt_verify, 384 }; 385 386 STATIC int 387 xfs_rmapbt_keys_inorder( 388 struct xfs_btree_cur *cur, 389 union xfs_btree_key *k1, 390 union xfs_btree_key *k2) 391 { 392 uint32_t x; 393 uint32_t y; 394 uint64_t a; 395 uint64_t b; 396 397 x = be32_to_cpu(k1->rmap.rm_startblock); 398 y = be32_to_cpu(k2->rmap.rm_startblock); 399 if (x < y) 400 return 1; 401 else if (x > y) 402 return 0; 403 a = be64_to_cpu(k1->rmap.rm_owner); 404 b = be64_to_cpu(k2->rmap.rm_owner); 405 if (a < b) 406 return 1; 407 else if (a > b) 408 return 0; 409 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset)); 410 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset)); 411 if (a <= b) 412 return 1; 413 return 0; 414 } 415 416 STATIC int 417 xfs_rmapbt_recs_inorder( 418 struct xfs_btree_cur *cur, 419 union xfs_btree_rec *r1, 420 union xfs_btree_rec *r2) 421 { 422 uint32_t x; 423 uint32_t y; 424 uint64_t a; 425 uint64_t b; 426 427 x = be32_to_cpu(r1->rmap.rm_startblock); 428 y = be32_to_cpu(r2->rmap.rm_startblock); 429 if (x < y) 430 return 1; 431 else if (x > y) 432 return 0; 433 a = be64_to_cpu(r1->rmap.rm_owner); 434 b = be64_to_cpu(r2->rmap.rm_owner); 435 if (a < b) 436 return 1; 437 else if (a > b) 438 return 0; 439 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset)); 440 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset)); 441 if (a <= b) 442 return 1; 443 return 0; 444 } 445 446 static const struct xfs_btree_ops xfs_rmapbt_ops = { 447 .rec_len = sizeof(struct xfs_rmap_rec), 448 .key_len = 2 * sizeof(struct xfs_rmap_key), 449 450 .dup_cursor = xfs_rmapbt_dup_cursor, 451 .set_root = xfs_rmapbt_set_root, 452 .alloc_block = xfs_rmapbt_alloc_block, 453 .free_block = xfs_rmapbt_free_block, 454 .get_minrecs = xfs_rmapbt_get_minrecs, 455 .get_maxrecs = xfs_rmapbt_get_maxrecs, 456 .init_key_from_rec = xfs_rmapbt_init_key_from_rec, 457 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec, 458 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur, 459 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur, 460 .key_diff = xfs_rmapbt_key_diff, 461 .buf_ops = &xfs_rmapbt_buf_ops, 462 .diff_two_keys = xfs_rmapbt_diff_two_keys, 463 .keys_inorder = xfs_rmapbt_keys_inorder, 464 .recs_inorder = xfs_rmapbt_recs_inorder, 465 }; 466 467 /* 468 * Allocate a new allocation btree cursor. 469 */ 470 struct xfs_btree_cur * 471 xfs_rmapbt_init_cursor( 472 struct xfs_mount *mp, 473 struct xfs_trans *tp, 474 struct xfs_buf *agbp, 475 xfs_agnumber_t agno) 476 { 477 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); 478 struct xfs_btree_cur *cur; 479 480 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); 481 cur->bc_tp = tp; 482 cur->bc_mp = mp; 483 /* Overlapping btree; 2 keys per pointer. */ 484 cur->bc_btnum = XFS_BTNUM_RMAP; 485 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; 486 cur->bc_blocklog = mp->m_sb.sb_blocklog; 487 cur->bc_ops = &xfs_rmapbt_ops; 488 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); 489 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); 490 491 cur->bc_private.a.agbp = agbp; 492 cur->bc_private.a.agno = agno; 493 494 return cur; 495 } 496 497 /* 498 * Calculate number of records in an rmap btree block. 499 */ 500 int 501 xfs_rmapbt_maxrecs( 502 int blocklen, 503 int leaf) 504 { 505 blocklen -= XFS_RMAP_BLOCK_LEN; 506 507 if (leaf) 508 return blocklen / sizeof(struct xfs_rmap_rec); 509 return blocklen / 510 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); 511 } 512 513 /* Compute the maximum height of an rmap btree. */ 514 void 515 xfs_rmapbt_compute_maxlevels( 516 struct xfs_mount *mp) 517 { 518 /* 519 * On a non-reflink filesystem, the maximum number of rmap 520 * records is the number of blocks in the AG, hence the max 521 * rmapbt height is log_$maxrecs($agblocks). However, with 522 * reflink each AG block can have up to 2^32 (per the refcount 523 * record format) owners, which means that theoretically we 524 * could face up to 2^64 rmap records. 525 * 526 * That effectively means that the max rmapbt height must be 527 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG 528 * blocks to feed the rmapbt long before the rmapbt reaches 529 * maximum height. The reflink code uses ag_resv_critical to 530 * disallow reflinking when less than 10% of the per-AG metadata 531 * block reservation since the fallback is a regular file copy. 532 */ 533 if (xfs_sb_version_hasreflink(&mp->m_sb)) 534 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; 535 else 536 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( 537 mp->m_rmap_mnr, mp->m_sb.sb_agblocks); 538 } 539 540 /* Calculate the refcount btree size for some records. */ 541 xfs_extlen_t 542 xfs_rmapbt_calc_size( 543 struct xfs_mount *mp, 544 unsigned long long len) 545 { 546 return xfs_btree_calc_size(mp->m_rmap_mnr, len); 547 } 548 549 /* 550 * Calculate the maximum refcount btree size. 551 */ 552 xfs_extlen_t 553 xfs_rmapbt_max_size( 554 struct xfs_mount *mp, 555 xfs_agblock_t agblocks) 556 { 557 /* Bail out if we're uninitialized, which can happen in mkfs. */ 558 if (mp->m_rmap_mxr[0] == 0) 559 return 0; 560 561 return xfs_rmapbt_calc_size(mp, agblocks); 562 } 563 564 /* 565 * Figure out how many blocks to reserve and how many are used by this btree. 566 */ 567 int 568 xfs_rmapbt_calc_reserves( 569 struct xfs_mount *mp, 570 xfs_agnumber_t agno, 571 xfs_extlen_t *ask, 572 xfs_extlen_t *used) 573 { 574 struct xfs_buf *agbp; 575 struct xfs_agf *agf; 576 xfs_agblock_t agblocks; 577 xfs_extlen_t tree_len; 578 int error; 579 580 if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) 581 return 0; 582 583 error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); 584 if (error) 585 return error; 586 587 agf = XFS_BUF_TO_AGF(agbp); 588 agblocks = be32_to_cpu(agf->agf_length); 589 tree_len = be32_to_cpu(agf->agf_rmap_blocks); 590 xfs_buf_relse(agbp); 591 592 /* Reserve 1% of the AG or enough for 1 block per record. */ 593 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); 594 *used += tree_len; 595 596 return error; 597 } 598