1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2014-2016 Christoph Hellwig. 4 */ 5 6 #include <linux/vmalloc.h> 7 8 #include "blocklayout.h" 9 10 #define NFSDBG_FACILITY NFSDBG_PNFS_LD 11 12 static inline struct pnfs_block_extent * 13 ext_node(struct rb_node *node) 14 { 15 return rb_entry(node, struct pnfs_block_extent, be_node); 16 } 17 18 static struct pnfs_block_extent * 19 ext_tree_first(struct rb_root *root) 20 { 21 struct rb_node *node = rb_first(root); 22 return node ? ext_node(node) : NULL; 23 } 24 25 static struct pnfs_block_extent * 26 ext_tree_prev(struct pnfs_block_extent *be) 27 { 28 struct rb_node *node = rb_prev(&be->be_node); 29 return node ? ext_node(node) : NULL; 30 } 31 32 static struct pnfs_block_extent * 33 ext_tree_next(struct pnfs_block_extent *be) 34 { 35 struct rb_node *node = rb_next(&be->be_node); 36 return node ? ext_node(node) : NULL; 37 } 38 39 static inline sector_t 40 ext_f_end(struct pnfs_block_extent *be) 41 { 42 return be->be_f_offset + be->be_length; 43 } 44 45 static struct pnfs_block_extent * 46 __ext_tree_search(struct rb_root *root, sector_t start) 47 { 48 struct rb_node *node = root->rb_node; 49 struct pnfs_block_extent *be = NULL; 50 51 while (node) { 52 be = ext_node(node); 53 if (start < be->be_f_offset) 54 node = node->rb_left; 55 else if (start >= ext_f_end(be)) 56 node = node->rb_right; 57 else 58 return be; 59 } 60 61 if (be) { 62 if (start < be->be_f_offset) 63 return be; 64 65 if (start >= ext_f_end(be)) 66 return ext_tree_next(be); 67 } 68 69 return NULL; 70 } 71 72 static bool 73 ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) 74 { 75 if (be1->be_state != be2->be_state) 76 return false; 77 if (be1->be_device != be2->be_device) 78 return false; 79 80 if (be1->be_f_offset + be1->be_length != be2->be_f_offset) 81 return false; 82 83 if (be1->be_state != PNFS_BLOCK_NONE_DATA && 84 (be1->be_v_offset + be1->be_length != be2->be_v_offset)) 85 return false; 86 87 if (be1->be_state == PNFS_BLOCK_INVALID_DATA && 88 be1->be_tag != be2->be_tag) 89 return false; 90 91 return true; 92 } 93 94 static struct pnfs_block_extent * 95 ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) 96 { 97 struct pnfs_block_extent *left = ext_tree_prev(be); 98 99 if (left && ext_can_merge(left, be)) { 100 left->be_length += be->be_length; 101 rb_erase(&be->be_node, root); 102 nfs4_put_deviceid_node(be->be_device); 103 kfree(be); 104 return left; 105 } 106 107 return be; 108 } 109 110 static struct pnfs_block_extent * 111 ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) 112 { 113 struct pnfs_block_extent *right = ext_tree_next(be); 114 115 if (right && ext_can_merge(be, right)) { 116 be->be_length += right->be_length; 117 rb_erase(&right->be_node, root); 118 nfs4_put_deviceid_node(right->be_device); 119 kfree(right); 120 } 121 122 return be; 123 } 124 125 static void __ext_put_deviceids(struct list_head *head) 126 { 127 struct pnfs_block_extent *be, *tmp; 128 129 list_for_each_entry_safe(be, tmp, head, be_list) { 130 nfs4_put_deviceid_node(be->be_device); 131 kfree(be); 132 } 133 } 134 135 static void 136 __ext_tree_insert(struct rb_root *root, 137 struct pnfs_block_extent *new, bool merge_ok) 138 { 139 struct rb_node **p = &root->rb_node, *parent = NULL; 140 struct pnfs_block_extent *be; 141 142 while (*p) { 143 parent = *p; 144 be = ext_node(parent); 145 146 if (new->be_f_offset < be->be_f_offset) { 147 if (merge_ok && ext_can_merge(new, be)) { 148 be->be_f_offset = new->be_f_offset; 149 if (be->be_state != PNFS_BLOCK_NONE_DATA) 150 be->be_v_offset = new->be_v_offset; 151 be->be_length += new->be_length; 152 be = ext_try_to_merge_left(root, be); 153 goto free_new; 154 } 155 p = &(*p)->rb_left; 156 } else if (new->be_f_offset >= ext_f_end(be)) { 157 if (merge_ok && ext_can_merge(be, new)) { 158 be->be_length += new->be_length; 159 be = ext_try_to_merge_right(root, be); 160 goto free_new; 161 } 162 p = &(*p)->rb_right; 163 } else { 164 BUG(); 165 } 166 } 167 168 rb_link_node(&new->be_node, parent, p); 169 rb_insert_color(&new->be_node, root); 170 return; 171 free_new: 172 nfs4_put_deviceid_node(new->be_device); 173 kfree(new); 174 } 175 176 static int 177 __ext_tree_remove(struct rb_root *root, 178 sector_t start, sector_t end, struct list_head *tmp) 179 { 180 struct pnfs_block_extent *be; 181 sector_t len1 = 0, len2 = 0; 182 sector_t orig_v_offset; 183 sector_t orig_len; 184 185 be = __ext_tree_search(root, start); 186 if (!be) 187 return 0; 188 if (be->be_f_offset >= end) 189 return 0; 190 191 orig_v_offset = be->be_v_offset; 192 orig_len = be->be_length; 193 194 if (start > be->be_f_offset) 195 len1 = start - be->be_f_offset; 196 if (ext_f_end(be) > end) 197 len2 = ext_f_end(be) - end; 198 199 if (len2 > 0) { 200 if (len1 > 0) { 201 struct pnfs_block_extent *new; 202 203 new = kzalloc(sizeof(*new), GFP_ATOMIC); 204 if (!new) 205 return -ENOMEM; 206 207 be->be_length = len1; 208 209 new->be_f_offset = end; 210 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 211 new->be_v_offset = 212 orig_v_offset + orig_len - len2; 213 } 214 new->be_length = len2; 215 new->be_state = be->be_state; 216 new->be_tag = be->be_tag; 217 new->be_device = nfs4_get_deviceid(be->be_device); 218 219 __ext_tree_insert(root, new, true); 220 } else { 221 be->be_f_offset = end; 222 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 223 be->be_v_offset = 224 orig_v_offset + orig_len - len2; 225 } 226 be->be_length = len2; 227 } 228 } else { 229 if (len1 > 0) { 230 be->be_length = len1; 231 be = ext_tree_next(be); 232 } 233 234 while (be && ext_f_end(be) <= end) { 235 struct pnfs_block_extent *next = ext_tree_next(be); 236 237 rb_erase(&be->be_node, root); 238 list_add_tail(&be->be_list, tmp); 239 be = next; 240 } 241 242 if (be && be->be_f_offset < end) { 243 len1 = ext_f_end(be) - end; 244 be->be_f_offset = end; 245 if (be->be_state != PNFS_BLOCK_NONE_DATA) 246 be->be_v_offset += be->be_length - len1; 247 be->be_length = len1; 248 } 249 } 250 251 return 0; 252 } 253 254 int 255 ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) 256 { 257 struct pnfs_block_extent *be; 258 struct rb_root *root; 259 int err = 0; 260 261 switch (new->be_state) { 262 case PNFS_BLOCK_READWRITE_DATA: 263 case PNFS_BLOCK_INVALID_DATA: 264 root = &bl->bl_ext_rw; 265 break; 266 case PNFS_BLOCK_READ_DATA: 267 case PNFS_BLOCK_NONE_DATA: 268 root = &bl->bl_ext_ro; 269 break; 270 default: 271 dprintk("invalid extent type\n"); 272 return -EINVAL; 273 } 274 275 spin_lock(&bl->bl_ext_lock); 276 retry: 277 be = __ext_tree_search(root, new->be_f_offset); 278 if (!be || be->be_f_offset >= ext_f_end(new)) { 279 __ext_tree_insert(root, new, true); 280 } else if (new->be_f_offset >= be->be_f_offset) { 281 if (ext_f_end(new) <= ext_f_end(be)) { 282 nfs4_put_deviceid_node(new->be_device); 283 kfree(new); 284 } else { 285 sector_t new_len = ext_f_end(new) - ext_f_end(be); 286 sector_t diff = new->be_length - new_len; 287 288 new->be_f_offset += diff; 289 new->be_v_offset += diff; 290 new->be_length = new_len; 291 goto retry; 292 } 293 } else if (ext_f_end(new) <= ext_f_end(be)) { 294 new->be_length = be->be_f_offset - new->be_f_offset; 295 __ext_tree_insert(root, new, true); 296 } else { 297 struct pnfs_block_extent *split; 298 sector_t new_len = ext_f_end(new) - ext_f_end(be); 299 sector_t diff = new->be_length - new_len; 300 301 split = kmemdup(new, sizeof(*new), GFP_ATOMIC); 302 if (!split) { 303 err = -EINVAL; 304 goto out; 305 } 306 307 split->be_length = be->be_f_offset - split->be_f_offset; 308 split->be_device = nfs4_get_deviceid(new->be_device); 309 __ext_tree_insert(root, split, true); 310 311 new->be_f_offset += diff; 312 new->be_v_offset += diff; 313 new->be_length = new_len; 314 goto retry; 315 } 316 out: 317 spin_unlock(&bl->bl_ext_lock); 318 return err; 319 } 320 321 static bool 322 __ext_tree_lookup(struct rb_root *root, sector_t isect, 323 struct pnfs_block_extent *ret) 324 { 325 struct rb_node *node; 326 struct pnfs_block_extent *be; 327 328 node = root->rb_node; 329 while (node) { 330 be = ext_node(node); 331 if (isect < be->be_f_offset) 332 node = node->rb_left; 333 else if (isect >= ext_f_end(be)) 334 node = node->rb_right; 335 else { 336 *ret = *be; 337 return true; 338 } 339 } 340 341 return false; 342 } 343 344 bool 345 ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, 346 struct pnfs_block_extent *ret, bool rw) 347 { 348 bool found = false; 349 350 spin_lock(&bl->bl_ext_lock); 351 if (!rw) 352 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); 353 if (!found) 354 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); 355 spin_unlock(&bl->bl_ext_lock); 356 357 return found; 358 } 359 360 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, 361 sector_t start, sector_t end) 362 { 363 int err, err2; 364 LIST_HEAD(tmp); 365 366 spin_lock(&bl->bl_ext_lock); 367 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 368 if (rw) { 369 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); 370 if (!err) 371 err = err2; 372 } 373 spin_unlock(&bl->bl_ext_lock); 374 375 __ext_put_deviceids(&tmp); 376 return err; 377 } 378 379 static int 380 ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, 381 sector_t split) 382 { 383 struct pnfs_block_extent *new; 384 sector_t orig_len = be->be_length; 385 386 new = kzalloc(sizeof(*new), GFP_ATOMIC); 387 if (!new) 388 return -ENOMEM; 389 390 be->be_length = split - be->be_f_offset; 391 392 new->be_f_offset = split; 393 if (be->be_state != PNFS_BLOCK_NONE_DATA) 394 new->be_v_offset = be->be_v_offset + be->be_length; 395 new->be_length = orig_len - be->be_length; 396 new->be_state = be->be_state; 397 new->be_tag = be->be_tag; 398 new->be_device = nfs4_get_deviceid(be->be_device); 399 400 __ext_tree_insert(root, new, false); 401 return 0; 402 } 403 404 int 405 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, 406 sector_t len, u64 lwb) 407 { 408 struct rb_root *root = &bl->bl_ext_rw; 409 sector_t end = start + len; 410 struct pnfs_block_extent *be; 411 int err = 0; 412 LIST_HEAD(tmp); 413 414 spin_lock(&bl->bl_ext_lock); 415 /* 416 * First remove all COW extents or holes from written to range. 417 */ 418 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 419 if (err) 420 goto out; 421 422 /* 423 * Then mark all invalid extents in the range as written to. 424 */ 425 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { 426 if (be->be_f_offset >= end) 427 break; 428 429 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) 430 continue; 431 432 if (be->be_f_offset < start) { 433 struct pnfs_block_extent *left = ext_tree_prev(be); 434 435 if (left && ext_can_merge(left, be)) { 436 sector_t diff = start - be->be_f_offset; 437 438 left->be_length += diff; 439 440 be->be_f_offset += diff; 441 be->be_v_offset += diff; 442 be->be_length -= diff; 443 } else { 444 err = ext_tree_split(root, be, start); 445 if (err) 446 goto out; 447 } 448 } 449 450 if (ext_f_end(be) > end) { 451 struct pnfs_block_extent *right = ext_tree_next(be); 452 453 if (right && ext_can_merge(be, right)) { 454 sector_t diff = end - be->be_f_offset; 455 456 be->be_length -= diff; 457 458 right->be_f_offset -= diff; 459 right->be_v_offset -= diff; 460 right->be_length += diff; 461 } else { 462 err = ext_tree_split(root, be, end); 463 if (err) 464 goto out; 465 } 466 } 467 468 if (be->be_f_offset >= start && ext_f_end(be) <= end) { 469 be->be_tag = EXTENT_WRITTEN; 470 be = ext_try_to_merge_left(root, be); 471 be = ext_try_to_merge_right(root, be); 472 } 473 } 474 out: 475 if (bl->bl_lwb < lwb) 476 bl->bl_lwb = lwb; 477 spin_unlock(&bl->bl_ext_lock); 478 479 __ext_put_deviceids(&tmp); 480 return err; 481 } 482 483 static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count) 484 { 485 if (bl->bl_scsi_layout) 486 return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count; 487 else 488 return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count; 489 } 490 491 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 492 size_t buffer_size) 493 { 494 if (arg->layoutupdate_pages != &arg->layoutupdate_page) { 495 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; 496 497 for (i = 0; i < nr_pages; i++) 498 put_page(arg->layoutupdate_pages[i]); 499 vfree(arg->start_p); 500 kfree(arg->layoutupdate_pages); 501 } else { 502 put_page(arg->layoutupdate_page); 503 } 504 } 505 506 static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p) 507 { 508 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, 509 NFS4_DEVICEID4_SIZE); 510 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 511 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 512 p = xdr_encode_hyper(p, 0LL); 513 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); 514 return p; 515 } 516 517 static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) 518 { 519 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 520 return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 521 } 522 523 static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 524 size_t buffer_size, size_t *count, __u64 *lastbyte) 525 { 526 struct pnfs_block_extent *be; 527 int ret = 0; 528 529 spin_lock(&bl->bl_ext_lock); 530 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { 531 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 532 be->be_tag != EXTENT_WRITTEN) 533 continue; 534 535 (*count)++; 536 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { 537 /* keep counting.. */ 538 ret = -ENOSPC; 539 continue; 540 } 541 542 if (bl->bl_scsi_layout) 543 p = encode_scsi_range(be, p); 544 else 545 p = encode_block_extent(be, p); 546 be->be_tag = EXTENT_COMMITTING; 547 } 548 *lastbyte = bl->bl_lwb - 1; 549 bl->bl_lwb = 0; 550 spin_unlock(&bl->bl_ext_lock); 551 552 return ret; 553 } 554 555 int 556 ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) 557 { 558 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 559 size_t count = 0, buffer_size = PAGE_SIZE; 560 __be32 *start_p; 561 int ret; 562 563 dprintk("%s enter\n", __func__); 564 565 arg->layoutupdate_page = alloc_page(GFP_NOFS); 566 if (!arg->layoutupdate_page) 567 return -ENOMEM; 568 start_p = page_address(arg->layoutupdate_page); 569 arg->layoutupdate_pages = &arg->layoutupdate_page; 570 571 retry: 572 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten); 573 if (unlikely(ret)) { 574 ext_tree_free_commitdata(arg, buffer_size); 575 576 buffer_size = ext_tree_layoutupdate_size(bl, count); 577 count = 0; 578 579 arg->layoutupdate_pages = 580 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 581 sizeof(struct page *), GFP_NOFS); 582 if (!arg->layoutupdate_pages) 583 return -ENOMEM; 584 585 start_p = __vmalloc(buffer_size, GFP_NOFS); 586 if (!start_p) { 587 kfree(arg->layoutupdate_pages); 588 return -ENOMEM; 589 } 590 591 goto retry; 592 } 593 594 *start_p = cpu_to_be32(count); 595 arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count); 596 597 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 598 void *p = start_p, *end = p + arg->layoutupdate_len; 599 struct page *page = NULL; 600 int i = 0; 601 602 arg->start_p = start_p; 603 for ( ; p < end; p += PAGE_SIZE) { 604 page = vmalloc_to_page(p); 605 arg->layoutupdate_pages[i++] = page; 606 get_page(page); 607 } 608 } 609 610 dprintk("%s found %zu ranges\n", __func__, count); 611 return 0; 612 } 613 614 void 615 ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) 616 { 617 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 618 struct rb_root *root = &bl->bl_ext_rw; 619 struct pnfs_block_extent *be; 620 621 dprintk("%s status %d\n", __func__, status); 622 623 ext_tree_free_commitdata(arg, arg->layoutupdate_len); 624 625 spin_lock(&bl->bl_ext_lock); 626 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { 627 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 628 be->be_tag != EXTENT_COMMITTING) 629 continue; 630 631 if (status) { 632 /* 633 * Mark as written and try again. 634 * 635 * XXX: some real error handling here wouldn't hurt.. 636 */ 637 be->be_tag = EXTENT_WRITTEN; 638 } else { 639 be->be_state = PNFS_BLOCK_READWRITE_DATA; 640 be->be_tag = 0; 641 } 642 643 be = ext_try_to_merge_left(root, be); 644 be = ext_try_to_merge_right(root, be); 645 } 646 spin_unlock(&bl->bl_ext_lock); 647 } 648