/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * uptodate.c
 *
 * Tracking the up-to-date-ness of a local buffer_head with respect to
 * the cluster.
 *
 * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Standard buffer head caching flags (uptodate, etc) are insufficient
 * in a clustered environment - a buffer may be marked up to date on
 * our local node but could have been modified by another cluster
 * member. As a result an additional (and performant) caching scheme
 * is required. A further requirement is that we consume as little
 * memory as possible - we never pin buffer_head structures in order
 * to cache them.
 *
 * We track the existence of up to date buffers on the inodes which
 * are associated with them. Because we don't want to pin
 * buffer_heads, this is only a (strong) hint and several other checks
 * are made in the I/O path to ensure that we don't use a stale or
 * invalid buffer without going to disk:
 *	- buffer_jbd is used liberally - if a bh is in the journal on
 *	  this node then it *must* be up to date.
 *	- the standard buffer_uptodate() macro is used to detect buffers
 *	  which may be invalid (even if we have an up to date tracking
 *	  item for them)
 *
 * For a full understanding of how this code works together, one
 * should read the callers in dlmglue.c, the I/O functions in
 * buffer_head_io.c and ocfs2_journal_access in journal.c
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
#include <linux/jbd.h>

#define MLOG_MASK_PREFIX ML_UPTODATE

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "inode.h"
#include "uptodate.h"

struct ocfs2_meta_cache_item {
	struct rb_node	c_node;
	sector_t	c_block;
};

static struct kmem_cache *ocfs2_uptodate_cachep = NULL;

void ocfs2_metadata_cache_init(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
	ci->ci_num_cached = 0;
}

/* No lock taken here as 'root' is not expected to be visible to other
 * processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
	unsigned int purged = 0;
	struct rb_node *node;
	struct ocfs2_meta_cache_item *item;

	while ((node = rb_last(root)) != NULL) {
		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);

		mlog(0, "Purge item %llu\n",
		     (unsigned long long) item->c_block);

		rb_erase(&item->c_node, root);
		kmem_cache_free(ocfs2_uptodate_cachep, item);

		purged++;
	}
	return purged;
}

/* Called from locking and called from ocfs2_clear_inode. Dump the
 * cache for a given inode.
 *
 * This function is a few more lines longer than necessary due to some
 * accounting done here, but I think it's worth tracking down those
 * bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	unsigned int tree, to_purge, purged;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct rb_root root = RB_ROOT;

	spin_lock(&oi->ip_lock);
	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
	to_purge = ci->ci_num_cached;

	mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
	     tree ? "tree" : "array", (unsigned long long)oi->ip_blkno);

	/* If we're a tree, save off the root so that we can safely
	 * initialize the cache. We do the work to free tree members
	 * without the spinlock. */
	if (tree)
		root = ci->ci_cache.ci_tree;

	ocfs2_metadata_cache_init(inode);
	spin_unlock(&oi->ip_lock);

	purged = ocfs2_purge_copied_metadata_tree(&root);
	/* If possible, track the number wiped so that we can more
	 * easily detect counting errors. Unfortunately, this is only
	 * meaningful for trees. */
	if (tree && purged != to_purge)
		mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
		     (unsigned long long)oi->ip_blkno, to_purge, purged);
}

/* Returns the index in the cache array, -1 if not found.
 * Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
				    sector_t item)
{
	int i;

	for (i = 0; i < ci->ci_num_cached; i++) {
		if (item == ci->ci_cache.ci_array[i])
			return i;
	}

	return -1;
}

/* Returns the cache item if found, otherwise NULL.
 * Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
			sector_t block)
{
	struct rb_node *n = ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *item = NULL;

	while (n) {
		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);

		if (block < item->c_block)
			n = n->rb_left;
		else if (block > item->c_block)
			n = n->rb_right;
		else
			return item;
	}

	return NULL;
}

static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
			       struct buffer_head *bh)
{
	int index = -1;
	struct ocfs2_meta_cache_item *item = NULL;

	spin_lock(&oi->ip_lock);

	mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long) bh->b_blocknr,
	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));

	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
						 bh->b_blocknr);
	else
		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
					       bh->b_blocknr);

	spin_unlock(&oi->ip_lock);

	mlog(0, "index = %d, item = %p\n", index, item);

	return (index != -1) || (item != NULL);
}

/* Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache.
 *
 * This can be called under lock_buffer()
 */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh)
{
	/* Doesn't matter if the bh is in our cache or not -- if it's
	 * not marked uptodate then we know it can't have correct
	 * data. */
	if (!buffer_uptodate(bh))
		return 0;

	/* OCFS2 does not allow multiple nodes to be changing the same
	 * block at the same time. */
	if (buffer_jbd(bh))
		return 1;

	/* Ok, locally the buffer is marked as up to date, now search
	 * our cache to see if we can trust that. */
	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
}

/*
 * Determine whether a buffer is currently out on a read-ahead request.
 * ip_io_mutex should be held to serialize submitters with the logic here.
 */
int ocfs2_buffer_read_ahead(struct inode *inode,
			    struct buffer_head *bh)
{
	return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
}

/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
				     sector_t block)
{
	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);

	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
	ci->ci_num_cached++;
}

/* By now the caller should have checked that the item does *not*
 * exist in the tree.
 * Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
				      struct ocfs2_meta_cache_item *new)
{
	sector_t block = new->c_block;
	struct rb_node *parent = NULL;
	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *tmp;

	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);

		if (block < tmp->c_block)
			p = &(*p)->rb_left;
		else if (block > tmp->c_block)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
			     (unsigned long long) block);
			BUG();
		}
	}

	rb_link_node(&new->c_node, parent, p);
	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached++;
}

static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
					     struct ocfs2_caching_info *ci)
{
	assert_spin_locked(&oi->ip_lock);

	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
}

/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
 * pointers in tree after we use them - this allows caller to detect
 * when to free in case of error. */
static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
			"Inode %llu, num cached = %u, should be %u\n",
			(unsigned long long)oi->ip_blkno, ci->ci_num_cached,
			OCFS2_INODE_MAX_CACHE_ARRAY);
	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
			"Inode %llu not marked as inline anymore!\n",
			(unsigned long long)oi->ip_blkno);
	assert_spin_locked(&oi->ip_lock);

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... */
	for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
	     (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
}

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
					sector_t block,
					int expand_tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct ocfs2_meta_cache_item *new = NULL;
	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
		{ NULL, };

	mlog(0, "Inode %llu, block %llu, expand = %d\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long)block, expand_tree);

	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
	if (!new) {
		mlog_errno(-ENOMEM);
		return;
	}
	new->c_block = block;

	if (expand_tree) {
		/* Do *not* allocate an array here - the removal code
		 * has no way of tracking that. */
		for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
						   GFP_NOFS);
			if (!tree[i]) {
				mlog_errno(-ENOMEM);
				goto out_free;
			}

			/* These are initialized in ocfs2_expand_cache! */
		}
	}

	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		mlog(0, "Someone cleared the tree underneath us\n");
		/* Ok, items were removed from the cache in between
		 * locks. Detect this and revert back to the fast path */
		ocfs2_append_cache_array(ci, block);
		spin_unlock(&oi->ip_lock);
		goto out_free;
	}

	if (expand_tree)
		ocfs2_expand_cache(oi, tree);

	__ocfs2_insert_cache_tree(ci, new);
	spin_unlock(&oi->ip_lock);

	new = NULL;
out_free:
	if (new)
		kmem_cache_free(ocfs2_uptodate_cachep, new);

	/* If these were used, then ocfs2_expand_cache re-set them to
	 * NULL for us. */
	if (tree[0]) {
		for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
			if (tree[i])
				kmem_cache_free(ocfs2_uptodate_cachep,
						tree[i]);
	}
}

/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
 * advantage of this by not rechecking for a duplicate insert during
 * the slow case. Additionally, if the cache needs to be bumped up to
 * a tree, the code will not recheck after acquiring the lock --
 * multiple paths cannot be expanding to a tree at the same time.
 *
 * The slow path takes into account that items can be removed
 * (including the whole tree wiped and reset) while this process is out
 * allocating memory. In those cases, it reverts back to the fast
 * path.
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal however (but may
 * result in a performance penalty).
 *
 * Readahead buffers can be passed in here before the I/O request is
 * completed.
 */
void ocfs2_set_buffer_uptodate(struct inode *inode,
			       struct buffer_head *bh)
{
	int expand;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	/* The block may very well exist in our cache already, so avoid
	 * doing any more work in that case. */
	if (ocfs2_buffer_cached(oi, bh))
		return;

	mlog(0, "Inode %llu, inserting block %llu\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long)bh->b_blocknr);

	/* No need to recheck under spinlock - insertion is guarded by
	 * ip_io_mutex */
	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		/* Fast case - it's an array and there's a free
		 * spot. */
		ocfs2_append_cache_array(ci, bh->b_blocknr);
		spin_unlock(&oi->ip_lock);
		return;
	}

	expand = 0;
	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
		/* We need to bump things up to a tree. */
		expand = 1;
	}
	spin_unlock(&oi->ip_lock);

	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
}

/* Called against a newly allocated buffer. Most likely nobody should
 * be able to read this sort of metadata while it's still being
 * allocated, but this is careful to take ip_io_mutex anyway. */
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
				   struct buffer_head *bh)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	/* This should definitely *not* exist in our cache */
	BUG_ON(ocfs2_buffer_cached(oi, bh));

	set_buffer_uptodate(bh);

	mutex_lock(&oi->ip_io_mutex);
	ocfs2_set_buffer_uptodate(inode, bh);
	mutex_unlock(&oi->ip_io_mutex);
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
					int index)
{
	sector_t *array = ci->ci_cache.ci_array;
	int bytes;

	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
	BUG_ON(index >= ci->ci_num_cached);
	BUG_ON(!ci->ci_num_cached);

	mlog(0, "remove index %d (num_cached = %u)\n", index,
	     ci->ci_num_cached);

	ci->ci_num_cached--;

	/* don't need to copy if the array is now empty, or if we
	 * removed at the tail */
	if (ci->ci_num_cached && index < ci->ci_num_cached) {
		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
		memmove(&array[index], &array[index + 1], bytes);
	}
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
				       struct ocfs2_meta_cache_item *item)
{
	mlog(0, "remove block %llu from tree\n",
	     (unsigned long long) item->c_block);

	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached--;
}

/* Called when we remove a chunk of metadata from an inode. We don't
 * bother reverting things to an inlined array in the case of a remove
 * which moves us back under the limit. */
void ocfs2_remove_from_cache(struct inode *inode,
			     struct buffer_head *bh)
{
	int index;
	sector_t block = bh->b_blocknr;
	struct ocfs2_meta_cache_item *item = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	spin_lock(&oi->ip_lock);
	mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long) block, ci->ci_num_cached,
	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);

	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
		index = ocfs2_search_cache_array(ci, block);
		if (index != -1)
			ocfs2_remove_metadata_array(ci, index);
	} else {
		item = ocfs2_search_cache_tree(ci, block);
		if (item)
			ocfs2_remove_metadata_tree(ci, item);
	}
	spin_unlock(&oi->ip_lock);

	if (item)
		kmem_cache_free(ocfs2_uptodate_cachep, item);
}

int __init init_ocfs2_uptodate_cache(void)
{
	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
				  sizeof(struct ocfs2_meta_cache_item),
				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ocfs2_uptodate_cachep)
		return -ENOMEM;

	mlog(0, "%u inlined cache items per inode.\n",
	     OCFS2_INODE_MAX_CACHE_ARRAY);

	return 0;
}

void exit_ocfs2_uptodate_cache(void)
{
	if (ocfs2_uptodate_cachep)
		kmem_cache_destroy(ocfs2_uptodate_cachep);
}