/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * uptodate.c
 *
 * Tracking the up-to-date-ness of a local buffer_head with respect to
 * the cluster.
 *
 * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Standard buffer head caching flags (uptodate, etc) are insufficient
 * in a clustered environment - a buffer may be marked up to date on
 * our local node but could have been modified by another cluster
 * member. As a result, an additional (and performant) caching scheme
 * is required. A further requirement is that we consume as little
 * memory as possible - we never pin buffer_head structures in order
 * to cache them.
 *
 * We track the existence of up to date buffers on the inodes which
 * are associated with them. Because we don't want to pin
 * buffer_heads, this is only a (strong) hint and several other checks
 * are made in the I/O path to ensure that we don't use a stale or
 * invalid buffer without going to disk:
 *	- buffer_jbd is used liberally - if a bh is in the journal on
 *	  this node then it *must* be up to date.
 *	- the standard buffer_uptodate() macro is used to detect buffers
 *	  which may be invalid (even if we have an up to date tracking
 *	  item for them)
 *
 * For a full understanding of how this code works together, one
 * should read the callers in dlmglue.c, the I/O functions in
 * buffer_head_io.c and ocfs2_journal_access in journal.c
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
#include <linux/jbd.h>

#define MLOG_MASK_PREFIX ML_UPTODATE

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "inode.h"
#include "uptodate.h"

struct ocfs2_meta_cache_item {
	struct rb_node	c_node;
	sector_t	c_block;
};

static kmem_cache_t *ocfs2_uptodate_cachep = NULL;

void ocfs2_metadata_cache_init(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	oi->ip_flags |= OCFS2_INODE_CACHE_INLINE;
	ci->ci_num_cached = 0;
}
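
/*
 * Illustrative summary of how the cache representation evolves. This is a
 * hand-written sketch, not code that runs, and the block numbers are made
 * up for the example:
 *
 *	ocfs2_metadata_cache_init(inode);
 *		-> OCFS2_INODE_CACHE_INLINE set, ci_num_cached == 0
 *	ocfs2_set_buffer_uptodate(inode, bh);	(blocknr 8193)
 *		-> fast path: ci_array[0] = 8193
 *	... repeat until ci_num_cached == OCFS2_INODE_MAX_CACHE_ARRAY ...
 *	ocfs2_set_buffer_uptodate(inode, bh);	(blocknr 10240)
 *		-> slow path: allocate items, ocfs2_expand_cache() moves the
 *		   array contents into ci_tree and clears the INLINE flag,
 *		   then 10240 is inserted into the rb-tree
 */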

/* No lock taken here as 'root' is not expected to be visible to other
 * processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
	unsigned int purged = 0;
	struct rb_node *node;
	struct ocfs2_meta_cache_item *item;

	while ((node = rb_last(root)) != NULL) {
		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);

		mlog(0, "Purge item %llu\n",
		     (unsigned long long) item->c_block);

		rb_erase(&item->c_node, root);
		kmem_cache_free(ocfs2_uptodate_cachep, item);

		purged++;
	}
	return purged;
}

/* Called from the locking code and from ocfs2_clear_inode. Dump the
 * cache for a given inode.
 *
 * This function is a few lines longer than necessary due to some
 * accounting done here, but I think it's worth tracking down those
 * bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	unsigned int tree, to_purge, purged;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct rb_root root = RB_ROOT;

	spin_lock(&oi->ip_lock);
	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
	to_purge = ci->ci_num_cached;

	mlog(0, "Purge %u %s items from Inode %"MLFu64"\n", to_purge,
	     tree ? "tree" : "array", oi->ip_blkno);

	/* If we're a tree, save off the root so that we can safely
	 * initialize the cache. We do the work to free tree members
	 * without the spinlock. */
	if (tree)
		root = ci->ci_cache.ci_tree;

	ocfs2_metadata_cache_init(inode);
	spin_unlock(&oi->ip_lock);

	purged = ocfs2_purge_copied_metadata_tree(&root);
	/* If possible, track the number wiped so that we can more
	 * easily detect counting errors. Unfortunately, this is only
	 * meaningful for trees. */
	if (tree && purged != to_purge)
		mlog(ML_ERROR, "Inode %"MLFu64", count = %u, purged = %u\n",
		     oi->ip_blkno, to_purge, purged);
}

/* Returns the index in the cache array, -1 if not found.
 * Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
				    sector_t item)
{
	int i;

	for (i = 0; i < ci->ci_num_cached; i++) {
		if (item == ci->ci_cache.ci_array[i])
			return i;
	}

	return -1;
}
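
/*
 * A small worked example of the array search above (values invented for
 * illustration): with ci_num_cached == 2 and ci_array == { 8193, 10240 },
 * ocfs2_search_cache_array(ci, 10240) returns 1, while
 * ocfs2_search_cache_array(ci, 4096) returns -1.
 */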

/* Returns the cache item if found, otherwise NULL.
 * Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
			sector_t block)
{
	struct rb_node *n = ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *item = NULL;

	while (n) {
		item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);

		if (block < item->c_block)
			n = n->rb_left;
		else if (block > item->c_block)
			n = n->rb_right;
		else
			return item;
	}

	return NULL;
}

static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
			       struct buffer_head *bh)
{
	int index = -1;
	struct ocfs2_meta_cache_item *item = NULL;

	spin_lock(&oi->ip_lock);

	mlog(0, "Inode %"MLFu64", query block %llu (inline = %u)\n",
	     oi->ip_blkno, (unsigned long long) bh->b_blocknr,
	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));

	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
		index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
						 bh->b_blocknr);
	else
		item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
					       bh->b_blocknr);

	spin_unlock(&oi->ip_lock);

	mlog(0, "index = %d, item = %p\n", index, item);

	return (index != -1) || (item != NULL);
}

/* Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache. */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh)
{
	/* Doesn't matter if the bh is in our cache or not -- if it's
	 * not marked uptodate then we know it can't have correct
	 * data. */
	if (!buffer_uptodate(bh))
		return 0;

	/* OCFS2 does not allow multiple nodes to be changing the same
	 * block at the same time. */
	if (buffer_jbd(bh))
		return 1;

	/* Ok, locally the buffer is marked as up to date, now search
	 * our cache to see if we can trust that. */
	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
}

/* Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
				     sector_t block)
{
	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);

	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
	ci->ci_num_cached++;
}

/* By now the caller should have checked that the item does *not*
 * exist in the tree.
 * Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
				      struct ocfs2_meta_cache_item *new)
{
	sector_t block = new->c_block;
	struct rb_node *parent = NULL;
	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *tmp;

	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);

		if (block < tmp->c_block)
			p = &(*p)->rb_left;
		else if (block > tmp->c_block)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
			     (unsigned long long) block);
			BUG();
		}
	}

	rb_link_node(&new->c_node, parent, p);
	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached++;
}

static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
					     struct ocfs2_caching_info *ci)
{
	assert_spin_locked(&oi->ip_lock);

	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
		(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
}

/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
 * pointers in tree after we use them - this allows the caller to
 * detect when to free in case of error. */
static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
			"Inode %"MLFu64", num cached = %u, should be %u\n",
			oi->ip_blkno, ci->ci_num_cached,
			OCFS2_INODE_MAX_CACHE_ARRAY);
	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
			"Inode %"MLFu64" not marked as inline anymore!\n",
			oi->ip_blkno);
	assert_spin_locked(&oi->ip_lock);

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... */
	for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	mlog(0, "Expanded %"MLFu64" to a tree cache: flags 0x%x, num = %u\n",
	     oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
}

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
					sector_t block,
					int expand_tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct ocfs2_meta_cache_item *new = NULL;
	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
		{ NULL, };

	mlog(0, "Inode %"MLFu64", block %llu, expand = %d\n",
	     oi->ip_blkno, (unsigned long long) block, expand_tree);

	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL);
	if (!new) {
		mlog_errno(-ENOMEM);
		return;
	}
	new->c_block = block;

	if (expand_tree) {
		/* Do *not* allocate an array here - the removal code
		 * has no way of tracking that. */
		for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
						   GFP_KERNEL);
			if (!tree[i]) {
				mlog_errno(-ENOMEM);
				goto out_free;
			}

			/* These are initialized in ocfs2_expand_cache! */
		}
	}

	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		mlog(0, "Someone cleared the tree underneath us\n");
		/* Ok, items were removed from the cache in between
		 * locks. Detect this and revert back to the fast
		 * path. */
		ocfs2_append_cache_array(ci, block);
		spin_unlock(&oi->ip_lock);
		goto out_free;
	}

	if (expand_tree)
		ocfs2_expand_cache(oi, tree);

	__ocfs2_insert_cache_tree(ci, new);
	spin_unlock(&oi->ip_lock);

	new = NULL;
out_free:
	if (new)
		kmem_cache_free(ocfs2_uptodate_cachep, new);

	/* If these were used, then ocfs2_expand_cache re-set them to
	 * NULL for us. */
	if (tree[0]) {
		for (i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
			if (tree[i])
				kmem_cache_free(ocfs2_uptodate_cachep,
						tree[i]);
	}
}

/* Item insertion is guarded by ip_io_sem, so the insertion path takes
 * advantage of this by not rechecking for a duplicate insert during
 * the slow case. Additionally, if the cache needs to be bumped up to
 * a tree, the code will not recheck after acquiring the lock --
 * multiple paths cannot be expanding to a tree at the same time.
 *
 * The slow path takes into account that items can be removed
 * (including the whole tree wiped and reset) while this process is
 * out allocating memory. In those cases, it reverts back to the fast
 * path.
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal, however, though it
 * may result in a performance penalty. */
void ocfs2_set_buffer_uptodate(struct inode *inode,
			       struct buffer_head *bh)
{
	int expand;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	/* The block may very well exist in our cache already, so avoid
	 * doing any more work in that case. */
	if (ocfs2_buffer_cached(oi, bh))
		return;

	mlog(0, "Inode %"MLFu64", inserting block %llu\n", oi->ip_blkno,
	     (unsigned long long) bh->b_blocknr);

	/* No need to recheck under spinlock - insertion is guarded by
	 * ip_io_sem */
	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		/* Fast case - it's an array and there's a free
		 * spot. */
		ocfs2_append_cache_array(ci, bh->b_blocknr);
		spin_unlock(&oi->ip_lock);
		return;
	}

	expand = 0;
	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
		/* We need to bump things up to a tree. */
		expand = 1;
	}
	spin_unlock(&oi->ip_lock);

	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
}

/* Called against a newly allocated buffer. Most likely nobody should
 * be able to read this sort of metadata while it's still being
 * allocated, but this code is careful to take ip_io_sem anyway. */
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
				   struct buffer_head *bh)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	/* This should definitely *not* exist in our cache */
	BUG_ON(ocfs2_buffer_cached(oi, bh));

	set_buffer_uptodate(bh);

	down(&oi->ip_io_sem);
	ocfs2_set_buffer_uptodate(inode, bh);
	up(&oi->ip_io_sem);
}
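
/*
 * Illustrative (hypothetical) read-path caller, showing how the pieces
 * above are meant to be combined. The real callers live in
 * buffer_head_io.c and dlmglue.c; the read helper named below is made up
 * for the example:
 *
 *	down(&OCFS2_I(inode)->ip_io_sem);
 *	if (!ocfs2_buffer_uptodate(inode, bh))
 *		read_block_from_disk(bh);	(hypothetical helper)
 *	ocfs2_set_buffer_uptodate(inode, bh);
 *	up(&OCFS2_I(inode)->ip_io_sem);
 *
 * ip_io_sem is held across the insert because, as noted above
 * ocfs2_set_buffer_uptodate(), item insertion relies on it to serialize
 * concurrent inserters.
 */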

/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
					int index)
{
	sector_t *array = ci->ci_cache.ci_array;
	int bytes;

	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
	BUG_ON(index >= ci->ci_num_cached);
	BUG_ON(!ci->ci_num_cached);

	mlog(0, "remove index %d (num_cached = %u)\n", index,
	     ci->ci_num_cached);

	ci->ci_num_cached--;

	/* don't need to copy if the array is now empty, or if we
	 * removed at the tail */
	if (ci->ci_num_cached && index < ci->ci_num_cached) {
		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
		memmove(&array[index], &array[index + 1], bytes);
	}
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
				       struct ocfs2_meta_cache_item *item)
{
	mlog(0, "remove block %llu from tree\n",
	     (unsigned long long) item->c_block);

	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached--;
}

/* Called when we remove a chunk of metadata from an inode. We don't
 * bother reverting things to an inlined array in the case of a remove
 * which moves us back under the limit. */
void ocfs2_remove_from_cache(struct inode *inode,
			     struct buffer_head *bh)
{
	int index;
	sector_t block = bh->b_blocknr;
	struct ocfs2_meta_cache_item *item = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	spin_lock(&oi->ip_lock);
	mlog(0, "Inode %"MLFu64", remove %llu, items = %u, array = %u\n",
	     oi->ip_blkno, (unsigned long long) block, ci->ci_num_cached,
	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);

	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
		index = ocfs2_search_cache_array(ci, block);
		if (index != -1)
			ocfs2_remove_metadata_array(ci, index);
	} else {
		item = ocfs2_search_cache_tree(ci, block);
		if (item)
			ocfs2_remove_metadata_tree(ci, item);
	}
	spin_unlock(&oi->ip_lock);

	if (item)
		kmem_cache_free(ocfs2_uptodate_cachep, item);
}

int __init init_ocfs2_uptodate_cache(void)
{
	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
				  sizeof(struct ocfs2_meta_cache_item),
				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!ocfs2_uptodate_cachep)
		return -ENOMEM;

	mlog(0, "%u inlined cache items per inode.\n",
	     OCFS2_INODE_MAX_CACHE_ARRAY);

	return 0;
}

void __exit exit_ocfs2_uptodate_cache(void)
{
	if (ocfs2_uptodate_cachep)
		kmem_cache_destroy(ocfs2_uptodate_cachep);
}