// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * uptodate.c
 *
 * Tracking the up-to-date-ness of a local buffer_head with respect to
 * the cluster.
 *
 * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
 *
 * Standard buffer head caching flags (uptodate, etc) are insufficient
 * in a clustered environment - a buffer may be marked up to date on
 * our local node but could have been modified by another cluster
 * member. As a result an additional (and performant) caching scheme
 * is required. A further requirement is that we consume as little
 * memory as possible - we never pin buffer_head structures in order
 * to cache them.
 *
 * We track the existence of up to date buffers on the inodes which
 * are associated with them. Because we don't want to pin
 * buffer_heads, this is only a (strong) hint and several other checks
 * are made in the I/O path to ensure that we don't use a stale or
 * invalid buffer without going to disk:
 *	- buffer_jbd is used liberally - if a bh is in the journal on
 *	  this node then it *must* be up to date.
 *	- the standard buffer_uptodate() macro is used to detect buffers
 *	  which may be invalid (even if we have an up to date tracking
 *	  item for them)
 *
 * For a full understanding of how this code works together, one
 * should read the callers in dlmglue.c, the I/O functions in
 * buffer_head_io.c and ocfs2_journal_access in journal.c
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "inode.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

struct ocfs2_meta_cache_item {
	struct rb_node	c_node;
	sector_t	c_block;
};

static struct kmem_cache *ocfs2_uptodate_cachep;

u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	return ci->ci_ops->co_owner(ci);
}

struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	return ci->ci_ops->co_get_super(ci);
}

static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_cache_lock(ci);
}

static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_cache_unlock(ci);
}

void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_io_lock(ci);
}

void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_io_unlock(ci);
}

static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
				       int clear)
{
	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
	ci->ci_num_cached = 0;

	if (clear) {
		ci->ci_created_trans = 0;
		ci->ci_last_trans = 0;
	}
}

void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
			       const struct ocfs2_caching_operations *ops)
{
	BUG_ON(!ops);

	ci->ci_ops = ops;
	ocfs2_metadata_cache_reset(ci, 1);
}

void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
{
	ocfs2_metadata_cache_purge(ci);
	ocfs2_metadata_cache_reset(ci, 1);
}
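
/*
 * For orientation, a sketch of the storage this file manipulates. The
 * authoritative declaration lives in ocfs2.h; the layout shown here is
 * illustrative only:
 *
 *	struct ocfs2_caching_info {
 *		...
 *		unsigned int	ci_flags;
 *		unsigned int	ci_num_cached;
 *		union {
 *			sector_t	ci_array[OCFS2_CACHE_INFO_MAX_ARRAY];
 *			struct rb_root	ci_tree;
 *		} ci_cache;
 *	};
 *
 * While OCFS2_CACHE_FL_INLINE is set, cached block numbers live in the
 * small fixed-size ci_array. Once the cache outgrows the array, it is
 * converted to an rb-tree of separately allocated
 * ocfs2_meta_cache_item structures; because ci_array and ci_tree
 * overlay each other, the array contents are dead from that point on.
 */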

/* No lock taken here as 'root' is not expected to be visible to other
 * processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
	unsigned int purged = 0;
	struct rb_node *node;
	struct ocfs2_meta_cache_item *item;

	while ((node = rb_last(root)) != NULL) {
		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);

		trace_ocfs2_purge_copied_metadata_tree(
					(unsigned long long) item->c_block);

		rb_erase(&item->c_node, root);
		kmem_cache_free(ocfs2_uptodate_cachep, item);

		purged++;
	}
	return purged;
}

/* Called from the locking code and from ocfs2_clear_inode. Dump the
 * cache for a given inode.
 *
 * This function is a few lines longer than necessary due to some
 * accounting done here, but I think it's worth tracking down those
 * bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
{
	unsigned int tree, to_purge, purged;
	struct rb_root root = RB_ROOT;

	BUG_ON(!ci || !ci->ci_ops);

	ocfs2_metadata_cache_lock(ci);
	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
	to_purge = ci->ci_num_cached;

	trace_ocfs2_metadata_cache_purge(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		to_purge, tree);

	/* If we're a tree, save off the root so that we can safely
	 * initialize the cache. We do the work to free tree members
	 * without the spinlock. */
	if (tree)
		root = ci->ci_cache.ci_tree;

	ocfs2_metadata_cache_reset(ci, 0);
	ocfs2_metadata_cache_unlock(ci);

	purged = ocfs2_purge_copied_metadata_tree(&root);
	/* If possible, track the number wiped so that we can more
	 * easily detect counting errors. Unfortunately, this is only
	 * meaningful for trees. */
	if (tree && purged != to_purge)
		mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(ci),
		     to_purge, purged);
}

/* Returns the index in the cache array, or -1 if not found.
 * Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
				    sector_t item)
{
	int i;

	for (i = 0; i < ci->ci_num_cached; i++) {
		if (item == ci->ci_cache.ci_array[i])
			return i;
	}

	return -1;
}

/* Returns the cache item if found, otherwise NULL.
 * Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
			sector_t block)
{
	struct rb_node *n = ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *item = NULL;

	while (n) {
		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);

		if (block < item->c_block)
			n = n->rb_left;
		else if (block > item->c_block)
			n = n->rb_right;
		else
			return item;
	}

	return NULL;
}

static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
			       struct buffer_head *bh)
{
	int index = -1;
	struct ocfs2_meta_cache_item *item = NULL;

	ocfs2_metadata_cache_lock(ci);

	trace_ocfs2_buffer_cached_begin(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long) bh->b_blocknr,
		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));

	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
	else
		item = ocfs2_search_cache_tree(ci, bh->b_blocknr);

	ocfs2_metadata_cache_unlock(ci);

	trace_ocfs2_buffer_cached_end(index, item);

	return (index != -1) || (item != NULL);
}

/* Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache.
 *
 * This can be called under lock_buffer()
 */
int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
			  struct buffer_head *bh)
{
	/* Doesn't matter if the bh is in our cache or not -- if it's
	 * not marked uptodate then we know it can't have correct
	 * data. */
	if (!buffer_uptodate(bh))
		return 0;

	/* OCFS2 does not allow multiple nodes to be changing the same
	 * block at the same time. */
	if (buffer_jbd(bh))
		return 1;

	/* Ok, locally the buffer is marked as up to date, now search
	 * our cache to see if we can trust that. */
	return ocfs2_buffer_cached(ci, bh);
}
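
/*
 * Illustrative (not verbatim) read-path pattern from the callers this
 * file points at, e.g. buffer_head_io.c - the cache check above is what
 * decides whether a disk read can be skipped entirely:
 *
 *	if (ocfs2_buffer_uptodate(ci, bh)) {
 *		// trust the locally cached copy, no I/O needed
 *	} else {
 *		// read bh from disk, then on success:
 *		ocfs2_set_buffer_uptodate(ci, bh);
 *	}
 */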

/*
 * Determine whether a buffer is currently out on a read-ahead request.
 * ci_io_sem should be held to serialize submitters with the logic here.
 */
int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
			    struct buffer_head *bh)
{
	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
}

/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
				     sector_t block)
{
	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);

	trace_ocfs2_append_cache_array(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, ci->ci_num_cached);

	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
	ci->ci_num_cached++;
}

/* By now the caller should have checked that the item does *not*
 * exist in the tree.
 * Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
				      struct ocfs2_meta_cache_item *new)
{
	sector_t block = new->c_block;
	struct rb_node *parent = NULL;
	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *tmp;

	trace_ocfs2_insert_cache_tree(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, ci->ci_num_cached);

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);

		if (block < tmp->c_block)
			p = &(*p)->rb_left;
		else if (block > tmp->c_block)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
			     (unsigned long long) block);
			BUG();
		}
	}

	rb_link_node(&new->c_node, parent, p);
	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached++;
}

/* co_cache_lock() must be held */
static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
{
	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
		(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
}

/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
 * pointers in tree after we use them - this allows the caller to detect
 * when to free in case of error.
 *
 * The co_cache_lock() must be held. */
static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;

	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
			"Owner %llu, num cached = %u, should be %u\n",
			(unsigned long long)ocfs2_metadata_cache_owner(ci),
			ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
			"Owner %llu not marked as inline anymore!\n",
			(unsigned long long)ocfs2_metadata_cache_owner(ci));

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... */
	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	trace_ocfs2_expand_cache(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		ci->ci_flags, ci->ci_num_cached);
}
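
/*
 * Why the allocate-then-lock dance below: co_cache_lock() is a spinlock
 * in practice (the comments in this file call it "the spinlock"), and
 * kmem_cache_alloc(..., GFP_NOFS) may sleep. So every allocation - the
 * new item and, when expanding, one item per array slot - must happen
 * before the lock is taken. The cache can change while we are out
 * allocating, which is why the state is rechecked once the lock is held.
 */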

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
					sector_t block,
					int expand_tree)
{
	int i;
	struct ocfs2_meta_cache_item *new = NULL;
	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
		{ NULL, };

	trace_ocfs2_set_buffer_uptodate(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, expand_tree);

	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
	if (!new) {
		mlog_errno(-ENOMEM);
		return;
	}
	new->c_block = block;

	if (expand_tree) {
		/* Do *not* allocate an array here - the removal code
		 * has no way of tracking that. */
		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
						   GFP_NOFS);
			if (!tree[i]) {
				mlog_errno(-ENOMEM);
				goto out_free;
			}

			/* These are initialized in ocfs2_expand_cache! */
		}
	}

	ocfs2_metadata_cache_lock(ci);
	if (ocfs2_insert_can_use_array(ci)) {
		/* Ok, items were removed from the cache in between
		 * locks. Detect this and revert to the fast path. */
		ocfs2_append_cache_array(ci, block);
		ocfs2_metadata_cache_unlock(ci);
		goto out_free;
	}

	if (expand_tree)
		ocfs2_expand_cache(ci, tree);

	__ocfs2_insert_cache_tree(ci, new);
	ocfs2_metadata_cache_unlock(ci);

	new = NULL;
out_free:
	if (new)
		kmem_cache_free(ocfs2_uptodate_cachep, new);

	/* If these were used, then ocfs2_expand_cache re-set them to
	 * NULL for us. */
	if (tree[0]) {
		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
			if (tree[i])
				kmem_cache_free(ocfs2_uptodate_cachep,
						tree[i]);
	}
}

/* Item insertion is guarded by co_io_lock(), so the insertion path takes
 * advantage of this by not rechecking for a duplicate insert during
 * the slow case. Additionally, if the cache needs to be bumped up to
 * a tree, the code will not recheck after acquiring the lock --
 * multiple paths cannot be expanding to a tree at the same time.
 *
 * The slow path takes into account that items can be removed
 * (including the whole tree wiped and reset) while this process is out
 * allocating memory. In those cases, it reverts to the fast path.
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal, though it may result
 * in a performance penalty.
 *
 * Readahead buffers can be passed in here before the I/O request is
 * completed.
 */
void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
			       struct buffer_head *bh)
{
	int expand;

	/* The block may very well exist in our cache already, so avoid
	 * doing any more work in that case. */
	if (ocfs2_buffer_cached(ci, bh))
		return;

	trace_ocfs2_set_buffer_uptodate_begin(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)bh->b_blocknr);

	/* No need to recheck under spinlock - insertion is guarded by
	 * co_io_lock() */
	ocfs2_metadata_cache_lock(ci);
	if (ocfs2_insert_can_use_array(ci)) {
		/* Fast case - it's an array and there's a free
		 * spot. */
		ocfs2_append_cache_array(ci, bh->b_blocknr);
		ocfs2_metadata_cache_unlock(ci);
		return;
	}

	expand = 0;
	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
		/* We need to bump things up to a tree. */
		expand = 1;
	}
	ocfs2_metadata_cache_unlock(ci);

	__ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
}
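
/*
 * A minimal sketch of how a caller is expected to drive the above when
 * filling a buffer (simplified; see buffer_head_io.c for the real
 * thing). Holding co_io_lock() across the check-read-insert sequence is
 * what makes the no-duplicate-recheck assumption documented above safe:
 *
 *	ocfs2_metadata_cache_io_lock(ci);
 *	if (!ocfs2_buffer_uptodate(ci, bh)) {
 *		...submit the read and wait for completion...
 *		ocfs2_set_buffer_uptodate(ci, bh);
 *	}
 *	ocfs2_metadata_cache_io_unlock(ci);
 */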

/* Called against a newly allocated buffer. Most likely nobody should
 * be able to read this sort of metadata while it's still being
 * allocated, but we take co_io_lock() anyway to be careful. */
void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
				   struct buffer_head *bh)
{
	/* This should definitely *not* exist in our cache */
	BUG_ON(ocfs2_buffer_cached(ci, bh));

	set_buffer_uptodate(bh);

	ocfs2_metadata_cache_io_lock(ci);
	ocfs2_set_buffer_uptodate(ci, bh);
	ocfs2_metadata_cache_io_unlock(ci);
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
					int index)
{
	sector_t *array = ci->ci_cache.ci_array;
	int bytes;

	BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
	BUG_ON(index >= ci->ci_num_cached);
	BUG_ON(!ci->ci_num_cached);

	trace_ocfs2_remove_metadata_array(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		index, ci->ci_num_cached);

	ci->ci_num_cached--;

	/* don't need to copy if the array is now empty, or if we
	 * removed at the tail */
	if (ci->ci_num_cached && index < ci->ci_num_cached) {
		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
		memmove(&array[index], &array[index + 1], bytes);
	}
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
				       struct ocfs2_meta_cache_item *item)
{
	trace_ocfs2_remove_metadata_tree(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)item->c_block);

	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached--;
}

static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
					  sector_t block)
{
	int index;
	struct ocfs2_meta_cache_item *item = NULL;

	ocfs2_metadata_cache_lock(ci);
	trace_ocfs2_remove_block_from_cache(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long) block, ci->ci_num_cached,
		ci->ci_flags);

	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
		index = ocfs2_search_cache_array(ci, block);
		if (index != -1)
			ocfs2_remove_metadata_array(ci, index);
	} else {
		item = ocfs2_search_cache_tree(ci, block);
		if (item)
			ocfs2_remove_metadata_tree(ci, item);
	}
	ocfs2_metadata_cache_unlock(ci);

	if (item)
		kmem_cache_free(ocfs2_uptodate_cachep, item);
}

/*
 * Called when we remove a chunk of metadata from an inode. We don't
 * bother reverting things to an inlined array in the case of a remove
 * which moves us back under the limit.
 */
void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
			     struct buffer_head *bh)
{
	sector_t block = bh->b_blocknr;

	ocfs2_remove_block_from_cache(ci, block);
}

/* Called when we remove xattr clusters from an inode. */
void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
					    sector_t block,
					    u32 c_len)
{
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;

	for (i = 0; i < b_len; i++, block++)
		ocfs2_remove_block_from_cache(ci, block);
}
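
/*
 * Worked example for the cluster-to-block conversion above (the numbers
 * are illustrative; the real ratio comes from the superblock): with a
 * 4KB block size and a 64KB cluster size, ocfs2_clusters_to_blocks(sb, 1)
 * yields 16, so removing c_len == 3 clusters drops b_len == 48
 * consecutive blocks from the cache, starting at 'block'.
 */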

int __init init_ocfs2_uptodate_cache(void)
{
	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
				  sizeof(struct ocfs2_meta_cache_item),
				  0, SLAB_HWCACHE_ALIGN, NULL);
	if (!ocfs2_uptodate_cachep)
		return -ENOMEM;

	return 0;
}

void exit_ocfs2_uptodate_cache(void)
{
	kmem_cache_destroy(ocfs2_uptodate_cachep);
}