/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */

#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
        return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
        struct rb_node *node = rb_first(root);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_prev(&be->be_node);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_next(&be->be_node);
        return node ? ext_node(node) : NULL;
}

static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
        return be->be_f_offset + be->be_length;
}

static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
        struct rb_node *node = root->rb_node;
        struct pnfs_block_extent *be = NULL;

        while (node) {
                be = ext_node(node);
                if (start < be->be_f_offset)
                        node = node->rb_left;
                else if (start >= ext_f_end(be))
                        node = node->rb_right;
                else
                        return be;
        }

        if (be) {
                if (start < be->be_f_offset)
                        return be;

                if (start >= ext_f_end(be))
                        return ext_tree_next(be);
        }

        return NULL;
}

static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
        if (be1->be_state != be2->be_state)
                return false;
        if (be1->be_device != be2->be_device)
                return false;

        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
                return false;

        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
                return false;

        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
            be1->be_tag != be2->be_tag)
                return false;

        return true;
}

static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *left = ext_tree_prev(be);

        if (left && ext_can_merge(left, be)) {
                left->be_length += be->be_length;
                rb_erase(&be->be_node, root);
                nfs4_put_deviceid_node(be->be_device);
                kfree(be);
                return left;
        }

        return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *right = ext_tree_next(be);

        if (right && ext_can_merge(be, right)) {
                be->be_length += right->be_length;
                rb_erase(&right->be_node, root);
                nfs4_put_deviceid_node(right->be_device);
                kfree(right);
        }

        return be;
}

static void __ext_put_deviceids(struct list_head *head)
{
        struct pnfs_block_extent *be, *tmp;

        list_for_each_entry_safe(be, tmp, head, be_list) {
                nfs4_put_deviceid_node(be->be_device);
                kfree(be);
        }
}
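
/*
 * Insert @new at its offset-sorted position in @root.  If @merge_ok is
 * set, coalesce @new into an adjacent mergeable extent instead of
 * linking a new node.  Callers must not pass overlapping extents here;
 * overlaps are resolved by ext_tree_insert() before it gets this far.
 */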
static void
__ext_tree_insert(struct rb_root *root,
                struct pnfs_block_extent *new, bool merge_ok)
{
        struct rb_node **p = &root->rb_node, *parent = NULL;
        struct pnfs_block_extent *be;

        while (*p) {
                parent = *p;
                be = ext_node(parent);

                if (new->be_f_offset < be->be_f_offset) {
                        if (merge_ok && ext_can_merge(new, be)) {
                                be->be_f_offset = new->be_f_offset;
                                if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                        be->be_v_offset = new->be_v_offset;
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_left(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_left;
                } else if (new->be_f_offset >= ext_f_end(be)) {
                        if (merge_ok && ext_can_merge(be, new)) {
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_right(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_right;
                } else {
                        BUG();
                }
        }

        rb_link_node(&new->be_node, parent, p);
        rb_insert_color(&new->be_node, root);
        return;
free_new:
        nfs4_put_deviceid_node(new->be_device);
        kfree(new);
}

static int
__ext_tree_remove(struct rb_root *root,
                sector_t start, sector_t end, struct list_head *tmp)
{
        struct pnfs_block_extent *be;
        sector_t len1 = 0, len2 = 0;
        sector_t orig_v_offset;
        sector_t orig_len;

        be = __ext_tree_search(root, start);
        if (!be)
                return 0;
        if (be->be_f_offset >= end)
                return 0;

        orig_v_offset = be->be_v_offset;
        orig_len = be->be_length;

        if (start > be->be_f_offset)
                len1 = start - be->be_f_offset;
        if (ext_f_end(be) > end)
                len2 = ext_f_end(be) - end;

        if (len2 > 0) {
                if (len1 > 0) {
                        struct pnfs_block_extent *new;

                        new = kzalloc(sizeof(*new), GFP_ATOMIC);
                        if (!new)
                                return -ENOMEM;

                        be->be_length = len1;

                        new->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                new->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        new->be_length = len2;
                        new->be_state = be->be_state;
                        new->be_tag = be->be_tag;
                        new->be_device = nfs4_get_deviceid(be->be_device);

                        __ext_tree_insert(root, new, true);
                } else {
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                be->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        be->be_length = len2;
                }
        } else {
                if (len1 > 0) {
                        be->be_length = len1;
                        be = ext_tree_next(be);
                }

                while (be && ext_f_end(be) <= end) {
                        struct pnfs_block_extent *next = ext_tree_next(be);

                        rb_erase(&be->be_node, root);
                        list_add_tail(&be->be_list, tmp);
                        be = next;
                }

                if (be && be->be_f_offset < end) {
                        len1 = ext_f_end(be) - end;
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                be->be_v_offset += be->be_length - len1;
                        be->be_length = len1;
                }
        }

        return 0;
}
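
/*
 * Insert @new into the read/write or read-only tree matching its state.
 * Extents already in the tree take precedence: @new is trimmed or split
 * around any overlapping entries, retrying until every non-overlapping
 * remainder has been inserted.
 */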
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
        struct pnfs_block_extent *be;
        struct rb_root *root;
        int err = 0;

        switch (new->be_state) {
        case PNFS_BLOCK_READWRITE_DATA:
        case PNFS_BLOCK_INVALID_DATA:
                root = &bl->bl_ext_rw;
                break;
        case PNFS_BLOCK_READ_DATA:
        case PNFS_BLOCK_NONE_DATA:
                root = &bl->bl_ext_ro;
                break;
        default:
                dprintk("invalid extent type\n");
                return -EINVAL;
        }

        spin_lock(&bl->bl_ext_lock);
retry:
        be = __ext_tree_search(root, new->be_f_offset);
        if (!be || be->be_f_offset >= ext_f_end(new)) {
                __ext_tree_insert(root, new, true);
        } else if (new->be_f_offset >= be->be_f_offset) {
                if (ext_f_end(new) <= ext_f_end(be)) {
                        nfs4_put_deviceid_node(new->be_device);
                        kfree(new);
                } else {
                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
                        sector_t diff = new->be_length - new_len;

                        new->be_f_offset += diff;
                        new->be_v_offset += diff;
                        new->be_length = new_len;
                        goto retry;
                }
        } else if (ext_f_end(new) <= ext_f_end(be)) {
                new->be_length = be->be_f_offset - new->be_f_offset;
                __ext_tree_insert(root, new, true);
        } else {
                struct pnfs_block_extent *split;
                sector_t new_len = ext_f_end(new) - ext_f_end(be);
                sector_t diff = new->be_length - new_len;

                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
                if (!split) {
                        err = -ENOMEM;
                        goto out;
                }

                split->be_length = be->be_f_offset - split->be_f_offset;
                split->be_device = nfs4_get_deviceid(new->be_device);
                __ext_tree_insert(root, split, true);

                new->be_f_offset += diff;
                new->be_v_offset += diff;
                new->be_length = new_len;
                goto retry;
        }
out:
        spin_unlock(&bl->bl_ext_lock);
        return err;
}

static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
                struct pnfs_block_extent *ret)
{
        struct rb_node *node;
        struct pnfs_block_extent *be;

        node = root->rb_node;
        while (node) {
                be = ext_node(node);
                if (isect < be->be_f_offset)
                        node = node->rb_left;
                else if (isect >= ext_f_end(be))
                        node = node->rb_right;
                else {
                        *ret = *be;
                        return true;
                }
        }

        return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
            struct pnfs_block_extent *ret, bool rw)
{
        bool found = false;

        spin_lock(&bl->bl_ext_lock);
        if (!rw)
                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
        if (!found)
                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
        spin_unlock(&bl->bl_ext_lock);

        return found;
}

int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
                sector_t start, sector_t end)
{
        int err, err2;
        LIST_HEAD(tmp);

        spin_lock(&bl->bl_ext_lock);
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (rw) {
                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
                if (!err)
                        err = err2;
        }
        spin_unlock(&bl->bl_ext_lock);

        __ext_put_deviceids(&tmp);
        return err;
}

static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
                sector_t split)
{
        struct pnfs_block_extent *new;
        sector_t orig_len = be->be_length;

        new = kzalloc(sizeof(*new), GFP_ATOMIC);
        if (!new)
                return -ENOMEM;

        be->be_length = split - be->be_f_offset;

        new->be_f_offset = split;
        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                new->be_v_offset = be->be_v_offset + be->be_length;
        new->be_length = orig_len - be->be_length;
        new->be_state = be->be_state;
        new->be_tag = be->be_tag;
        new->be_device = nfs4_get_deviceid(be->be_device);

        __ext_tree_insert(root, new, false);
        return 0;
}
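
/*
 * Mark the range [start, start + len) as written.  COW extents and holes
 * are first removed from the read-only tree; invalid extents covering
 * the range are then tagged EXTENT_WRITTEN, splitting any extent that
 * straddles a range boundary.
 */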
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
                sector_t len)
{
        struct rb_root *root = &bl->bl_ext_rw;
        sector_t end = start + len;
        struct pnfs_block_extent *be;
        int err = 0;
        LIST_HEAD(tmp);

        spin_lock(&bl->bl_ext_lock);
        /*
         * First remove all COW extents or holes from written to range.
         */
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (err)
                goto out;

        /*
         * Then mark all invalid extents in the range as written to.
         */
        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
                if (be->be_f_offset >= end)
                        break;

                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
                        continue;

                if (be->be_f_offset < start) {
                        struct pnfs_block_extent *left = ext_tree_prev(be);

                        if (left && ext_can_merge(left, be)) {
                                sector_t diff = start - be->be_f_offset;

                                left->be_length += diff;

                                be->be_f_offset += diff;
                                be->be_v_offset += diff;
                                be->be_length -= diff;
                        } else {
                                err = ext_tree_split(root, be, start);
                                if (err)
                                        goto out;
                        }
                }

                if (ext_f_end(be) > end) {
                        struct pnfs_block_extent *right = ext_tree_next(be);

                        if (right && ext_can_merge(be, right)) {
                                sector_t diff = end - be->be_f_offset;

                                be->be_length -= diff;

                                right->be_f_offset -= diff;
                                right->be_v_offset -= diff;
                                right->be_length += diff;
                        } else {
                                err = ext_tree_split(root, be, end);
                                if (err)
                                        goto out;
                        }
                }

                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
                        be->be_tag = EXTENT_WRITTEN;
                        be = ext_try_to_merge_left(root, be);
                        be = ext_try_to_merge_right(root, be);
                }
        }
out:
        spin_unlock(&bl->bl_ext_lock);

        __ext_put_deviceids(&tmp);
        return err;
}

static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
        if (bl->bl_scsi_layout)
                return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
        else
                return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
                size_t buffer_size)
{
        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

                for (i = 0; i < nr_pages; i++)
                        put_page(arg->layoutupdate_pages[i]);
                vfree(arg->start_p);
                kfree(arg->layoutupdate_pages);
        } else {
                put_page(arg->layoutupdate_page);
        }
}

static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
        p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
                        NFS4_DEVICEID4_SIZE);
        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
        p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
        p = xdr_encode_hyper(p, 0LL);
        *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
        return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
        return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
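
/*
 * Encode all written extents into the layoutupdate buffer and tag them
 * EXTENT_COMMITTING.  If the buffer is too small, keep counting extents
 * and return -ENOSPC so that the caller can retry with a buffer sized
 * from the final count.
 */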
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
                size_t buffer_size, size_t *count)
{
        struct pnfs_block_extent *be;
        int ret = 0;

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_WRITTEN)
                        continue;

                (*count)++;
                if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
                        /* keep counting.. */
                        ret = -ENOSPC;
                        continue;
                }

                if (bl->bl_scsi_layout)
                        p = encode_scsi_range(be, p);
                else
                        p = encode_block_extent(be, p);
                be->be_tag = EXTENT_COMMITTING;
        }
        spin_unlock(&bl->bl_ext_lock);

        return ret;
}

int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        size_t count = 0, buffer_size = PAGE_SIZE;
        __be32 *start_p;
        int ret;

        dprintk("%s enter\n", __func__);

        arg->layoutupdate_page = alloc_page(GFP_NOFS);
        if (!arg->layoutupdate_page)
                return -ENOMEM;
        start_p = page_address(arg->layoutupdate_page);
        arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
        if (unlikely(ret)) {
                ext_tree_free_commitdata(arg, buffer_size);

                buffer_size = ext_tree_layoutupdate_size(bl, count);
                count = 0;

                arg->layoutupdate_pages =
                        kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
                                sizeof(struct page *), GFP_NOFS);
                if (!arg->layoutupdate_pages)
                        return -ENOMEM;

                start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
                if (!start_p) {
                        kfree(arg->layoutupdate_pages);
                        return -ENOMEM;
                }

                goto retry;
        }

        *start_p = cpu_to_be32(count);
        arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
                void *p = start_p, *end = p + arg->layoutupdate_len;
                struct page *page = NULL;
                int i = 0;

                arg->start_p = start_p;
                for ( ; p < end; p += PAGE_SIZE) {
                        page = vmalloc_to_page(p);
                        arg->layoutupdate_pages[i++] = page;
                        get_page(page);
                }
        }

        dprintk("%s found %zu ranges\n", __func__, count);
        return 0;
}

void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        struct rb_root *root = &bl->bl_ext_rw;
        struct pnfs_block_extent *be;

        dprintk("%s status %d\n", __func__, status);

        ext_tree_free_commitdata(arg, arg->layoutupdate_len);

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_COMMITTING)
                        continue;

                if (status) {
                        /*
                         * Mark as written and try again.
                         *
                         * XXX: some real error handling here wouldn't hurt..
                         */
                        be->be_tag = EXTENT_WRITTEN;
                } else {
                        be->be_state = PNFS_BLOCK_READWRITE_DATA;
                        be->be_tag = 0;
                }

                be = ext_try_to_merge_left(root, be);
                be = ext_try_to_merge_right(root, be);
        }
        spin_unlock(&bl->bl_ext_lock);
}