// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * uptodate.c
 *
 * Tracking the up-to-date-ness of a local buffer_head with respect to
 * the cluster.
 *
 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
 *
 * Standard buffer head caching flags (uptodate, etc) are insufficient
 * in a clustered environment - a buffer may be marked up to date on
 * our local node but could have been modified by another cluster
 * member. As a result an additional (and performant) caching scheme
 * is required. A further requirement is that we consume as little
 * memory as possible - we never pin buffer_head structures in order
 * to cache them.
 *
 * We track the existence of up to date buffers on the inodes which
 * are associated with them. Because we don't want to pin
 * buffer_heads, this is only a (strong) hint and several other checks
 * are made in the I/O path to ensure that we don't use a stale or
 * invalid buffer without going to disk:
 *	- buffer_jbd is used liberally - if a bh is in the journal on
 *	  this node then it *must* be up to date.
 *	- the standard buffer_uptodate() macro is used to detect buffers
 *	  which may be invalid (even if we have an up to date tracking
 *	  item for them)
 *
 * For a full understanding of how this code works together, one
 * should read the callers in dlmglue.c, the I/O functions in
 * buffer_head_io.c and ocfs2_journal_access in journal.c
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "inode.h"
#include "uptodate.h"
#include "ocfs2_trace.h"

struct ocfs2_meta_cache_item {
	struct rb_node	c_node;
	sector_t	c_block;
};

static struct kmem_cache *ocfs2_uptodate_cachep;

u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	return ci->ci_ops->co_owner(ci);
}

struct super_block *ocfs2_metadata_cache_get_super(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	return ci->ci_ops->co_get_super(ci);
}

static void ocfs2_metadata_cache_lock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_cache_lock(ci);
}

static void ocfs2_metadata_cache_unlock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_cache_unlock(ci);
}

void ocfs2_metadata_cache_io_lock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_io_lock(ci);
}

void ocfs2_metadata_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	BUG_ON(!ci || !ci->ci_ops);

	ci->ci_ops->co_io_unlock(ci);
}
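
/*
 * Illustrative sketch: the wrappers above dispatch through the
 * per-owner ocfs2_caching_operations vtable in ci->ci_ops. A cache
 * owner embedding a struct ocfs2_caching_info wires it up roughly as
 * below; the "foo" names are hypothetical, for illustration only --
 * see ocfs2_inode_caching_ops in inode.c for the real inode version.
 *
 *	static u64 foo_co_owner(struct ocfs2_caching_info *ci)
 *	{
 *		struct foo_owner *fo = container_of(ci, struct foo_owner,
 *						    fo_metadata_cache);
 *
 *		return fo->fo_blkno;
 *	}
 *
 *	static const struct ocfs2_caching_operations foo_caching_ops = {
 *		.co_owner		= foo_co_owner,
 *		.co_get_super		= foo_co_get_super,
 *		.co_cache_lock		= foo_co_cache_lock,
 *		.co_cache_unlock	= foo_co_cache_unlock,
 *		.co_io_lock		= foo_co_io_lock,
 *		.co_io_unlock		= foo_co_io_unlock,
 *	};
 */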

static void ocfs2_metadata_cache_reset(struct ocfs2_caching_info *ci,
				       int clear)
{
	ci->ci_flags |= OCFS2_CACHE_FL_INLINE;
	ci->ci_num_cached = 0;

	if (clear) {
		ci->ci_created_trans = 0;
		ci->ci_last_trans = 0;
	}
}

void ocfs2_metadata_cache_init(struct ocfs2_caching_info *ci,
			       const struct ocfs2_caching_operations *ops)
{
	BUG_ON(!ops);

	ci->ci_ops = ops;
	ocfs2_metadata_cache_reset(ci, 1);
}

void ocfs2_metadata_cache_exit(struct ocfs2_caching_info *ci)
{
	ocfs2_metadata_cache_purge(ci);
	ocfs2_metadata_cache_reset(ci, 1);
}
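
/*
 * Illustrative sketch of the expected lifecycle, using the
 * hypothetical foo_owner from the sketch above: initialize the cache
 * once with the owner's ops table, and tear it down when the owner
 * goes away.
 *
 *	ocfs2_metadata_cache_init(&fo->fo_metadata_cache,
 *				  &foo_caching_ops);
 *	...
 *	ocfs2_metadata_cache_exit(&fo->fo_metadata_cache);
 *
 * Note that ocfs2_metadata_cache_exit() purges any remaining tracking
 * items itself, so teardown paths need no separate purge call.
 */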

/* No lock taken here as 'root' is not expected to be visible to other
 * processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
	unsigned int purged = 0;
	struct rb_node *node;
	struct ocfs2_meta_cache_item *item;

	while ((node = rb_last(root)) != NULL) {
		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);

		trace_ocfs2_purge_copied_metadata_tree(
				(unsigned long long) item->c_block);

		rb_erase(&item->c_node, root);
		kmem_cache_free(ocfs2_uptodate_cachep, item);

		purged++;
	}
	return purged;
}

/* Called from locking code and from ocfs2_clear_inode. Dump the
 * cache for a given inode.
 *
 * This function is a few lines longer than necessary due to some
 * accounting done here, but I think it's worth tracking down those
 * bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
{
	unsigned int tree, to_purge, purged;
	struct rb_root root = RB_ROOT;

	BUG_ON(!ci || !ci->ci_ops);

	ocfs2_metadata_cache_lock(ci);
	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
	to_purge = ci->ci_num_cached;

	trace_ocfs2_metadata_cache_purge(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		to_purge, tree);

	/* If we're a tree, save off the root so that we can safely
	 * initialize the cache. We do the work to free tree members
	 * without the spinlock. */
	if (tree)
		root = ci->ci_cache.ci_tree;

	ocfs2_metadata_cache_reset(ci, 0);
	ocfs2_metadata_cache_unlock(ci);

	purged = ocfs2_purge_copied_metadata_tree(&root);
	/* If possible, track the number wiped so that we can more
	 * easily detect counting errors. Unfortunately, this is only
	 * meaningful for trees. */
	if (tree && purged != to_purge)
		mlog(ML_ERROR, "Owner %llu, count = %u, purged = %u\n",
		     (unsigned long long)ocfs2_metadata_cache_owner(ci),
		     to_purge, purged);
}

/* Returns the index in the cache array, -1 if not found.
 * Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
				    sector_t item)
{
	int i;

	for (i = 0; i < ci->ci_num_cached; i++) {
		if (item == ci->ci_cache.ci_array[i])
			return i;
	}

	return -1;
}

/* Returns the cache item if found, otherwise NULL.
 * Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
			sector_t block)
{
	struct rb_node *n = ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *item = NULL;

	while (n) {
		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);

		if (block < item->c_block)
			n = n->rb_left;
		else if (block > item->c_block)
			n = n->rb_right;
		else
			return item;
	}

	return NULL;
}

static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
			       struct buffer_head *bh)
{
	int index = -1;
	struct ocfs2_meta_cache_item *item = NULL;

	ocfs2_metadata_cache_lock(ci);

	trace_ocfs2_buffer_cached_begin(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long) bh->b_blocknr,
		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));

	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
	else
		item = ocfs2_search_cache_tree(ci, bh->b_blocknr);

	ocfs2_metadata_cache_unlock(ci);

	trace_ocfs2_buffer_cached_end(index, item);

	return (index != -1) || (item != NULL);
}

/* Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache.
 *
 * This can be called under lock_buffer()
 */
int ocfs2_buffer_uptodate(struct ocfs2_caching_info *ci,
			  struct buffer_head *bh)
{
	/* Doesn't matter if the bh is in our cache or not -- if it's
	 * not marked uptodate then we know it can't have correct
	 * data. */
	if (!buffer_uptodate(bh))
		return 0;

	/* OCFS2 does not allow multiple nodes to be changing the same
	 * block at the same time. */
	if (buffer_jbd(bh))
		return 1;

	/* Ok, locally the buffer is marked as up to date, now search
	 * our cache to see if we can trust that. */
	return ocfs2_buffer_cached(ci, bh);
}

/*
 * Determine whether a buffer is currently out on a read-ahead request.
 * ci_io_sem should be held to serialize submitters with the logic here.
 */
int ocfs2_buffer_read_ahead(struct ocfs2_caching_info *ci,
			    struct buffer_head *bh)
{
	return buffer_locked(bh) && ocfs2_buffer_cached(ci, bh);
}
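
/*
 * Illustrative sketch of how a read path is expected to combine the
 * checks above: trust the cached copy only when
 * ocfs2_buffer_uptodate() says so, otherwise go to disk and mark the
 * freshly read buffer afterwards. This is a simplified outline; the
 * real logic, including the readahead handling, lives in
 * ocfs2_read_blocks() in buffer_head_io.c.
 *
 *	if (!ocfs2_buffer_uptodate(ci, bh)) {
 *		lock_buffer(bh);
 *		... submit the read and wait for completion ...
 *		unlock_buffer(bh);
 *		ocfs2_set_buffer_uptodate(ci, bh);
 *	}
 */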

/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
				     sector_t block)
{
	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);

	trace_ocfs2_append_cache_array(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, ci->ci_num_cached);

	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
	ci->ci_num_cached++;
}

/* By now the caller should have checked that the item does *not*
 * exist in the tree.
 * Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
				      struct ocfs2_meta_cache_item *new)
{
	sector_t block = new->c_block;
	struct rb_node *parent = NULL;
	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *tmp;

	trace_ocfs2_insert_cache_tree(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, ci->ci_num_cached);

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);

		if (block < tmp->c_block)
			p = &(*p)->rb_left;
		else if (block > tmp->c_block)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
			     (unsigned long long) block);
			BUG();
		}
	}

	rb_link_node(&new->c_node, parent, p);
	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached++;
}

/* co_cache_lock() must be held */
static inline int ocfs2_insert_can_use_array(struct ocfs2_caching_info *ci)
{
	return (ci->ci_flags & OCFS2_CACHE_FL_INLINE) &&
		(ci->ci_num_cached < OCFS2_CACHE_INFO_MAX_ARRAY);
}

/* tree should be exactly OCFS2_CACHE_INFO_MAX_ARRAY wide. NULL the
 * pointers in tree after we use them - this allows the caller to
 * detect when to free in case of error.
 *
 * The co_cache_lock() must be held. */
static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;

	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_CACHE_INFO_MAX_ARRAY,
			"Owner %llu, num cached = %u, should be %u\n",
			(unsigned long long)ocfs2_metadata_cache_owner(ci),
			ci->ci_num_cached, OCFS2_CACHE_INFO_MAX_ARRAY);
	mlog_bug_on_msg(!(ci->ci_flags & OCFS2_CACHE_FL_INLINE),
			"Owner %llu not marked as inline anymore!\n",
			(unsigned long long)ocfs2_metadata_cache_owner(ci));

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... */
	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	ci->ci_flags &= ~OCFS2_CACHE_FL_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	trace_ocfs2_expand_cache(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		ci->ci_flags, ci->ci_num_cached);
}
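
/*
 * Worked example of the array-to-tree promotion above, assuming
 * OCFS2_CACHE_INFO_MAX_ARRAY is 2 and the inline array holds blocks
 * { 100, 200 } when block 300 needs caching:
 *
 *	1. The slow path allocates two ocfs2_meta_cache_item structures
 *	   for the promotion, plus one for block 300.
 *	2. ocfs2_expand_cache() copies 100 and 200 into those items,
 *	   clears OCFS2_CACHE_FL_INLINE, resets the rb-tree and
 *	   re-inserts both items, NULLing the tree[] slots it consumed.
 *	3. __ocfs2_insert_cache_tree() then adds block 300, leaving
 *	   ci_num_cached == 3.
 */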

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
					sector_t block,
					int expand_tree)
{
	int i;
	struct ocfs2_meta_cache_item *new = NULL;
	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
		{ NULL, };

	trace_ocfs2_set_buffer_uptodate(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)block, expand_tree);

	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
	if (!new) {
		mlog_errno(-ENOMEM);
		return;
	}
	new->c_block = block;

	if (expand_tree) {
		/* Do *not* allocate an array here - the removal code
		 * has no way of tracking that. */
		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++) {
			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
						   GFP_NOFS);
			if (!tree[i]) {
				mlog_errno(-ENOMEM);
				goto out_free;
			}

			/* These are initialized in ocfs2_expand_cache! */
		}
	}

	ocfs2_metadata_cache_lock(ci);
	if (ocfs2_insert_can_use_array(ci)) {
		/* Ok, items were removed from the cache in between
		 * locks. Detect this and revert to the fast path. */
		ocfs2_append_cache_array(ci, block);
		ocfs2_metadata_cache_unlock(ci);
		goto out_free;
	}

	if (expand_tree)
		ocfs2_expand_cache(ci, tree);

	__ocfs2_insert_cache_tree(ci, new);
	ocfs2_metadata_cache_unlock(ci);

	new = NULL;
out_free:
	if (new)
		kmem_cache_free(ocfs2_uptodate_cachep, new);

	/* If these were used, then ocfs2_expand_cache re-set them to
	 * NULL for us. */
	if (tree[0]) {
		for (i = 0; i < OCFS2_CACHE_INFO_MAX_ARRAY; i++)
			if (tree[i])
				kmem_cache_free(ocfs2_uptodate_cachep,
						tree[i]);
	}
}

/* Item insertion is guarded by co_io_lock(), so the insertion path takes
 * advantage of this by not rechecking for a duplicate insert during
 * the slow case. Additionally, if the cache needs to be bumped up to
 * a tree, the code will not recheck after acquiring the lock --
 * multiple paths cannot be expanding to a tree at the same time.
 *
 * The slow path takes into account that items can be removed
 * (including the whole tree wiped and reset) while this process is out
 * allocating memory. In those cases, it reverts to the fast path.
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal, however (though it
 * may result in a performance penalty).
 *
 * Readahead buffers can be passed in here before the I/O request is
 * completed.
 */
void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
			       struct buffer_head *bh)
{
	int expand;

	/* The block may very well exist in our cache already, so avoid
	 * doing any more work in that case. */
	if (ocfs2_buffer_cached(ci, bh))
		return;

	trace_ocfs2_set_buffer_uptodate_begin(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)bh->b_blocknr);

	/* No need to recheck under spinlock - insertion is guarded by
	 * co_io_lock() */
	ocfs2_metadata_cache_lock(ci);
	if (ocfs2_insert_can_use_array(ci)) {
		/* Fast case - it's an array and there's a free
		 * spot. */
		ocfs2_append_cache_array(ci, bh->b_blocknr);
		ocfs2_metadata_cache_unlock(ci);
		return;
	}

	expand = 0;
	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
		/* We need to bump things up to a tree. */
		expand = 1;
	}
	ocfs2_metadata_cache_unlock(ci);

	__ocfs2_set_buffer_uptodate(ci, bh->b_blocknr, expand);
}

/* Called against a newly allocated buffer. Most likely nobody should
 * be able to read this sort of metadata while it's still being
 * allocated, but this is careful to take co_io_lock() anyway. */
void ocfs2_set_new_buffer_uptodate(struct ocfs2_caching_info *ci,
				   struct buffer_head *bh)
{
	/* This should definitely *not* exist in our cache */
	BUG_ON(ocfs2_buffer_cached(ci, bh));

	set_buffer_uptodate(bh);

	ocfs2_metadata_cache_io_lock(ci);
	ocfs2_set_buffer_uptodate(ci, bh);
	ocfs2_metadata_cache_io_unlock(ci);
}
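
/*
 * Illustrative sketch of the expected allocation-path usage: mark a
 * brand-new buffer before its initial contents are written, e.g.
 *
 *	bh = sb_getblk(sb, blkno);
 *	ocfs2_set_new_buffer_uptodate(ci, bh);
 *	... initialize the block contents, journal them, etc ...
 *
 * The BUG_ON() above catches anyone who managed to get the "new"
 * block into the cache while it was still being allocated.
 */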

/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
					int index)
{
	sector_t *array = ci->ci_cache.ci_array;
	int bytes;

	BUG_ON(index < 0 || index >= OCFS2_CACHE_INFO_MAX_ARRAY);
	BUG_ON(index >= ci->ci_num_cached);
	BUG_ON(!ci->ci_num_cached);

	trace_ocfs2_remove_metadata_array(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		index, ci->ci_num_cached);

	ci->ci_num_cached--;

	/* don't need to copy if the array is now empty, or if we
	 * removed at the tail */
	if (ci->ci_num_cached && index < ci->ci_num_cached) {
		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
		memmove(&array[index], &array[index + 1], bytes);
	}
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
				       struct ocfs2_meta_cache_item *item)
{
	trace_ocfs2_remove_metadata_tree(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long)item->c_block);

	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached--;
}

static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
					  sector_t block)
{
	int index;
	struct ocfs2_meta_cache_item *item = NULL;

	ocfs2_metadata_cache_lock(ci);
	trace_ocfs2_remove_block_from_cache(
		(unsigned long long)ocfs2_metadata_cache_owner(ci),
		(unsigned long long) block, ci->ci_num_cached,
		ci->ci_flags);

	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
		index = ocfs2_search_cache_array(ci, block);
		if (index != -1)
			ocfs2_remove_metadata_array(ci, index);
	} else {
		item = ocfs2_search_cache_tree(ci, block);
		if (item)
			ocfs2_remove_metadata_tree(ci, item);
	}
	ocfs2_metadata_cache_unlock(ci);

	if (item)
		kmem_cache_free(ocfs2_uptodate_cachep, item);
}

/*
 * Called when we remove a chunk of metadata from an inode. We don't
 * bother reverting things to an inlined array in the case of a remove
 * which moves us back under the limit.
 */
void ocfs2_remove_from_cache(struct ocfs2_caching_info *ci,
			     struct buffer_head *bh)
{
	sector_t block = bh->b_blocknr;

	ocfs2_remove_block_from_cache(ci, block);
}

/* Called when we remove xattr clusters from an inode. */
void ocfs2_remove_xattr_clusters_from_cache(struct ocfs2_caching_info *ci,
					    sector_t block,
					    u32 c_len)
{
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	unsigned int i, b_len = ocfs2_clusters_to_blocks(sb, 1) * c_len;

	for (i = 0; i < b_len; i++, block++)
		ocfs2_remove_block_from_cache(ci, block);
}

int __init init_ocfs2_uptodate_cache(void)
{
	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
				  sizeof(struct ocfs2_meta_cache_item),
				  0, SLAB_HWCACHE_ALIGN, NULL);
	if (!ocfs2_uptodate_cachep)
		return -ENOMEM;

	return 0;
}

void exit_ocfs2_uptodate_cache(void)
{
	kmem_cache_destroy(ocfs2_uptodate_cachep);
}
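
/*
 * Illustrative sketch of the module lifecycle for the slab cache
 * above; the actual calls are made from ocfs2_init()/ocfs2_exit() in
 * super.c:
 *
 *	status = init_ocfs2_uptodate_cache();
 *	if (status < 0)
 *		goto error;
 *	...
 *	exit_ocfs2_uptodate_cache();
 */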