/*
 * Block Translation Table
 * Copyright (c) 2014-2015, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/hdreg.h>
#include <linux/genhd.h>
#include <linux/sizes.h>
#include <linux/ndctl.h>
#include <linux/fs.h>
#include <linux/nd.h>
#include <linux/backing-dev.h>
#include "btt.h"
#include "nd.h"

enum log_ent_request {
	LOG_NEW_ENT = 0,
	LOG_OLD_ENT
};

static struct device *to_dev(struct arena_info *arena)
{
	return &arena->nd_btt->dev;
}

static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
{
	return offset + nd_btt->initial_offset;
}

static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
		void *buf, size_t n, unsigned long flags)
{
	struct nd_btt *nd_btt = arena->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* arena offsets may be shifted from the base of the device */
	offset = adjust_initial_offset(nd_btt, offset);
	return nvdimm_read_bytes(ndns, offset, buf, n, flags);
}

static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
		void *buf, size_t n, unsigned long flags)
{
	struct nd_btt *nd_btt = arena->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* arena offsets may be shifted from the base of the device */
	offset = adjust_initial_offset(nd_btt, offset);
	return nvdimm_write_bytes(ndns, offset, buf, n, flags);
}
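
/*
 * All metadata and data accesses below funnel through
 * arena_{read,write}_bytes(), so every offset in this file is arena-relative
 * and only becomes a raw namespace offset once adjust_initial_offset() adds
 * nd_btt->initial_offset.
 *
 * Illustrative example (hypothetical values, not taken from this file): with
 * initial_offset == SZ_4K, a read of the info block at arena->infooff == 0
 * turns into an nvdimm_read_bytes() call at namespace offset 4096.
 */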

static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
{
	int ret;

	/*
	 * infooff and info2off should always be at least 512B aligned.
	 * We rely on that to make sure rw_bytes does error clearing
	 * correctly, so make sure that is the case.
	 */
	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
		"arena->infooff: %#llx is unaligned\n", arena->infooff);
	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
		"arena->info2off: %#llx is unaligned\n", arena->info2off);

	ret = arena_write_bytes(arena, arena->info2off, super,
			sizeof(struct btt_sb), 0);
	if (ret)
		return ret;

	return arena_write_bytes(arena, arena->infooff, super,
			sizeof(struct btt_sb), 0);
}

static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
{
	return arena_read_bytes(arena, arena->infooff, super,
			sizeof(struct btt_sb), 0);
}

/*
 * 'raw' version of btt_map write
 * Assumptions:
 *   mapping is in little-endian
 *   mapping contains 'E' and 'Z' flags as desired
 */
static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
		unsigned long flags)
{
	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);

	if (unlikely(lba >= arena->external_nlba))
		dev_err_ratelimited(to_dev(arena),
			"%s: lba %#x out of range (max: %#x)\n",
			__func__, lba, arena->external_nlba);
	return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
}

static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
			u32 z_flag, u32 e_flag, unsigned long rwb_flags)
{
	u32 ze;
	__le32 mapping_le;

	/*
	 * This 'mapping' is supposed to be just the LBA mapping, without
	 * any flags set, so strip the flag bits.
	 */
	mapping = ent_lba(mapping);

	ze = (z_flag << 1) + e_flag;
	switch (ze) {
	case 0:
		/*
		 * We want to set neither of the Z or E flags, and
		 * in the actual layout, this means setting the bit
		 * positions of both to '1' to indicate a 'normal'
		 * map entry
		 */
		mapping |= MAP_ENT_NORMAL;
		break;
	case 1:
		mapping |= (1 << MAP_ERR_SHIFT);
		break;
	case 2:
		mapping |= (1 << MAP_TRIM_SHIFT);
		break;
	default:
		/*
		 * The case where Z and E are both sent in as '1' could be
		 * construed as a valid 'normal' case, but we decide not to,
		 * to avoid confusion
		 */
		dev_err_ratelimited(to_dev(arena),
			"Invalid use of Z and E flags\n");
		return -EIO;
	}

	mapping_le = cpu_to_le32(mapping);
	return __btt_map_write(arena, lba, mapping_le, rwb_flags);
}

static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
			int *trim, int *error, unsigned long rwb_flags)
{
	int ret;
	__le32 in;
	u32 raw_mapping, postmap, ze, z_flag, e_flag;
	u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);

	if (unlikely(lba >= arena->external_nlba))
		dev_err_ratelimited(to_dev(arena),
			"%s: lba %#x out of range (max: %#x)\n",
			__func__, lba, arena->external_nlba);

	ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
	if (ret)
		return ret;

	raw_mapping = le32_to_cpu(in);

	z_flag = ent_z_flag(raw_mapping);
	e_flag = ent_e_flag(raw_mapping);
	ze = (z_flag << 1) + e_flag;
	postmap = ent_lba(raw_mapping);

	/* Reuse the {z,e}_flag variables for *trim and *error */
	z_flag = 0;
	e_flag = 0;

	switch (ze) {
	case 0:
		/* Initial state. Return postmap = premap */
		*mapping = lba;
		break;
	case 1:
		*mapping = postmap;
		e_flag = 1;
		break;
	case 2:
		*mapping = postmap;
		z_flag = 1;
		break;
	case 3:
		*mapping = postmap;
		break;
	default:
		return -EIO;
	}

	if (trim)
		*trim = z_flag;
	if (error)
		*error = e_flag;

	return ret;
}
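
/*
 * Summary of the on-media map entry encoding used by the helpers above (bit
 * positions come from btt.h): the two highest bits of each 32-bit entry hold
 * the Z flag (at MAP_TRIM_SHIFT) and the E flag (at MAP_ERR_SHIFT), and the
 * remaining low bits hold the postmap block number:
 *
 *   Z E  meaning
 *   0 0  initial/identity state: btt_map_read() returns postmap == premap
 *   0 1  a media error was recorded for this block (reported via *error)
 *   1 0  the block was zeroed/trimmed (reported via *trim)
 *   1 1  MAP_ENT_NORMAL: a regular, explicitly written mapping
 */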

static int btt_log_group_read(struct arena_info *arena, u32 lane,
			struct log_group *log)
{
	return arena_read_bytes(arena,
			arena->logoff + (lane * LOG_GRP_SIZE), log,
			LOG_GRP_SIZE, 0);
}

static struct dentry *debugfs_root;

static void arena_debugfs_init(struct arena_info *a, struct dentry *parent,
				int idx)
{
	char dirname[32];
	struct dentry *d;

	/* If for some reason, parent bttN was not created, exit */
	if (!parent)
		return;

	snprintf(dirname, 32, "arena%d", idx);
	d = debugfs_create_dir(dirname, parent);
	if (IS_ERR_OR_NULL(d))
		return;
	a->debugfs_dir = d;

	debugfs_create_x64("size", S_IRUGO, d, &a->size);
	debugfs_create_x64("external_lba_start", S_IRUGO, d,
				&a->external_lba_start);
	debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba);
	debugfs_create_u32("internal_lbasize", S_IRUGO, d,
				&a->internal_lbasize);
	debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba);
	debugfs_create_u32("external_lbasize", S_IRUGO, d,
				&a->external_lbasize);
	debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree);
	debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major);
	debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor);
	debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff);
	debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff);
	debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff);
	debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff);
	debugfs_create_x64("logoff", S_IRUGO, d, &a->logoff);
	debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off);
	debugfs_create_x32("flags", S_IRUGO, d, &a->flags);
	debugfs_create_u32("log_index_0", S_IRUGO, d, &a->log_index[0]);
	debugfs_create_u32("log_index_1", S_IRUGO, d, &a->log_index[1]);
}

static void btt_debugfs_init(struct btt *btt)
{
	int i = 0;
	struct arena_info *arena;

	btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev),
						debugfs_root);
	if (IS_ERR_OR_NULL(btt->debugfs_dir))
		return;

	list_for_each_entry(arena, &btt->arena_list, list) {
		arena_debugfs_init(arena, btt->debugfs_dir, i);
		i++;
	}
}

static u32 log_seq(struct log_group *log, int log_idx)
{
	return le32_to_cpu(log->ent[log_idx].seq);
}

/*
 * This function accepts two log entries, and uses the
 * sequence number to find the 'older' entry.
 * If entry [0] has not been written yet (its sequence number is still 0),
 * it is initialized and reported as the older one.
 * Finally, it returns which of the entries was the older one.
 *
 * TODO The logic feels a bit kludge-y. make it better..
 */
static int btt_log_get_old(struct arena_info *a, struct log_group *log)
{
	int idx0 = a->log_index[0];
	int idx1 = a->log_index[1];
	int old;

	/*
	 * the first ever time this is seen, the entry goes into [0]
	 * the next time, the following logic works out to put this
	 * (next) entry into [1]
	 */
	if (log_seq(log, idx0) == 0) {
		log->ent[idx0].seq = cpu_to_le32(1);
		return 0;
	}

	if (log_seq(log, idx0) == log_seq(log, idx1))
		return -EINVAL;
	if (log_seq(log, idx0) + log_seq(log, idx1) > 5)
		return -EINVAL;

	if (log_seq(log, idx0) < log_seq(log, idx1)) {
		if ((log_seq(log, idx1) - log_seq(log, idx0)) == 1)
			old = 0;
		else
			old = 1;
	} else {
		if ((log_seq(log, idx0) - log_seq(log, idx1)) == 1)
			old = 1;
		else
			old = 0;
	}

	return old;
}
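
/*
 * The sequence numbers compared above cycle 1 -> 2 -> 3 -> 1 (see
 * btt_flog_write(), which wraps the per-lane seq back to 1 after 3), so a
 * valid pair of slots always differs by exactly one step around that cycle.
 * Worked examples: a pair of (2, 1) means the second slot is older; (1, 3)
 * also means the second slot is older, because 1 follows 3 across the
 * wrap-around. Equal values, or values whose sum exceeds 5, cannot be
 * produced by the 1..3 cycle and are rejected as corruption.
 */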

/*
 * This function copies the desired (old/new) log entry into ent if
 * it is not NULL. It returns the sub-slot number (0 or 1)
 * where the desired log entry was found. Negative return values
 * indicate errors.
 */
static int btt_log_read(struct arena_info *arena, u32 lane,
			struct log_entry *ent, int old_flag)
{
	int ret;
	int old_ent, ret_ent;
	struct log_group log;

	ret = btt_log_group_read(arena, lane, &log);
	if (ret)
		return -EIO;

	old_ent = btt_log_get_old(arena, &log);
	if (old_ent < 0 || old_ent > 1) {
		dev_err(to_dev(arena),
			"log corruption (%d): lane %d seq [%d, %d]\n",
			old_ent, lane, log.ent[arena->log_index[0]].seq,
			log.ent[arena->log_index[1]].seq);
		/* TODO set error state? */
		return -EIO;
	}

	ret_ent = (old_flag ? old_ent : (1 - old_ent));

	if (ent != NULL)
		memcpy(ent, &log.ent[arena->log_index[ret_ent]], LOG_ENT_SIZE);

	return ret_ent;
}

/*
 * This function commits a log entry to media
 * It does _not_ prepare the freelist entry for the next write
 * btt_flog_write is the wrapper for updating the freelist elements
 */
static int __btt_log_write(struct arena_info *arena, u32 lane,
			u32 sub, struct log_entry *ent, unsigned long flags)
{
	int ret;
	u32 group_slot = arena->log_index[sub];
	unsigned int log_half = LOG_ENT_SIZE / 2;
	void *src = ent;
	u64 ns_off;

	ns_off = arena->logoff + (lane * LOG_GRP_SIZE) +
		(group_slot * LOG_ENT_SIZE);
	/* split the 16B write into atomic, durable halves */
	ret = arena_write_bytes(arena, ns_off, src, log_half, flags);
	if (ret)
		return ret;

	ns_off += log_half;
	src += log_half;
	return arena_write_bytes(arena, ns_off, src, log_half, flags);
}

static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
			struct log_entry *ent)
{
	int ret;

	ret = __btt_log_write(arena, lane, sub, ent, NVDIMM_IO_ATOMIC);
	if (ret)
		return ret;

	/* prepare the next free entry */
	arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
	if (++(arena->freelist[lane].seq) == 4)
		arena->freelist[lane].seq = 1;
	if (ent_e_flag(ent->old_map))
		arena->freelist[lane].has_err = 1;
	arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));

	return ret;
}
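
/*
 * Note on the two-half write in __btt_log_write(): the 'seq' field sits in
 * the second half of the 16B entry (see struct log_entry in btt.h), so the
 * half that makes the entry "current" is committed last. If power is lost
 * between the two halves, the entry's sequence number never advances and
 * btt_log_get_old() keeps treating the previously committed entry as the
 * newest one.
 */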

/*
 * This function initializes the BTT map to the initial state, which is
 * all-zeroes, and indicates an identity mapping
 */
static int btt_map_init(struct arena_info *arena)
{
	int ret = -EINVAL;
	void *zerobuf;
	size_t offset = 0;
	size_t chunk_size = SZ_2M;
	size_t mapsize = arena->logoff - arena->mapoff;

	zerobuf = kzalloc(chunk_size, GFP_KERNEL);
	if (!zerobuf)
		return -ENOMEM;

	/*
	 * mapoff should always be at least 512B aligned. We rely on that to
	 * make sure rw_bytes does error clearing correctly, so make sure that
	 * is the case.
	 */
	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
		"arena->mapoff: %#llx is unaligned\n", arena->mapoff);

	while (mapsize) {
		size_t size = min(mapsize, chunk_size);

		dev_WARN_ONCE(to_dev(arena), size < 512,
			"chunk size: %#zx is unaligned\n", size);
		ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
				size, 0);
		if (ret)
			goto free;

		offset += size;
		mapsize -= size;
		cond_resched();
	}

 free:
	kfree(zerobuf);
	return ret;
}

/*
 * This function initializes the BTT log with 'fake' entries pointing
 * to the initial reserved set of blocks as being free
 */
static int btt_log_init(struct arena_info *arena)
{
	size_t logsize = arena->info2off - arena->logoff;
	size_t chunk_size = SZ_4K, offset = 0;
	struct log_entry ent;
	void *zerobuf;
	int ret;
	u32 i;

	zerobuf = kzalloc(chunk_size, GFP_KERNEL);
	if (!zerobuf)
		return -ENOMEM;
	/*
	 * logoff should always be at least 512B aligned. We rely on that to
	 * make sure rw_bytes does error clearing correctly, so make sure that
	 * is the case.
	 */
	dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
		"arena->logoff: %#llx is unaligned\n", arena->logoff);

	while (logsize) {
		size_t size = min(logsize, chunk_size);

		dev_WARN_ONCE(to_dev(arena), size < 512,
			"chunk size: %#zx is unaligned\n", size);
		ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
				size, 0);
		if (ret)
			goto free;

		offset += size;
		logsize -= size;
		cond_resched();
	}

	for (i = 0; i < arena->nfree; i++) {
		ent.lba = cpu_to_le32(i);
		ent.old_map = cpu_to_le32(arena->external_nlba + i);
		ent.new_map = cpu_to_le32(arena->external_nlba + i);
		ent.seq = cpu_to_le32(LOG_SEQ_INIT);
		ret = __btt_log_write(arena, i, 0, &ent, 0);
		if (ret)
			goto free;
	}

 free:
	kfree(zerobuf);
	return ret;
}

static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
{
	return arena->dataoff + ((u64)lba * arena->internal_lbasize);
}

static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
{
	int ret = 0;

	if (arena->freelist[lane].has_err) {
		void *zero_page = page_address(ZERO_PAGE(0));
		u32 lba = arena->freelist[lane].block;
		u64 nsoff = to_namespace_offset(arena, lba);
		unsigned long len = arena->sector_size;

		mutex_lock(&arena->err_lock);

		while (len) {
			unsigned long chunk = min(len, PAGE_SIZE);

			ret = arena_write_bytes(arena, nsoff, zero_page,
				chunk, 0);
			if (ret)
				break;
			len -= chunk;
			nsoff += chunk;
			if (len == 0)
				arena->freelist[lane].has_err = 0;
		}
		mutex_unlock(&arena->err_lock);
	}
	return ret;
}
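
/*
 * Error clearing on the free block: when a block reaches the freelist with
 * its E flag set (it previously saw a media error), it is rewritten with
 * zeroes in chunks of at most PAGE_SIZE before being handed out again.
 * Writing through arena_write_bytes() is what triggers the underlying error
 * clearing, and has_err is only dropped once a full sector_size worth of
 * data has been rewritten. err_lock serializes this against the write path,
 * which checks mutex_is_locked()/has_err before trusting the lane's free
 * block.
 */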

static int btt_freelist_init(struct arena_info *arena)
{
	int new, ret;
	struct log_entry log_new;
	u32 i, map_entry, log_oldmap, log_newmap;

	arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry),
					GFP_KERNEL);
	if (!arena->freelist)
		return -ENOMEM;

	for (i = 0; i < arena->nfree; i++) {
		new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT);
		if (new < 0)
			return new;

		/* old and new map entries with any flags stripped out */
		log_oldmap = ent_lba(le32_to_cpu(log_new.old_map));
		log_newmap = ent_lba(le32_to_cpu(log_new.new_map));

		/* sub points to the next one to be overwritten */
		arena->freelist[i].sub = 1 - new;
		arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
		arena->freelist[i].block = log_oldmap;

		/*
		 * FIXME: if error clearing fails during init, we want to make
		 * the BTT read-only
		 */
		if (ent_e_flag(log_new.old_map) &&
				!ent_normal(log_new.old_map)) {
			arena->freelist[i].has_err = 1;
			ret = arena_clear_freelist_error(arena, i);
			if (ret)
				dev_err_ratelimited(to_dev(arena),
					"Unable to clear known errors\n");
		}

		/* This implies a newly created or untouched flog entry */
		if (log_oldmap == log_newmap)
			continue;

		/* Check if map recovery is needed */
		ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry,
				NULL, NULL, 0);
		if (ret)
			return ret;

		/*
		 * The map_entry from btt_map_read is stripped of any flag bits,
		 * so use the stripped out versions from the log as well for
		 * testing whether recovery is needed. For restoration, use the
		 * 'raw' version of the log entries as that captured what we
		 * were going to write originally.
		 */
		if ((log_newmap != map_entry) && (log_oldmap == map_entry)) {
			/*
			 * Last transaction wrote the flog, but wasn't able
			 * to complete the map write. So fix up the map.
			 */
			ret = btt_map_write(arena, le32_to_cpu(log_new.lba),
					le32_to_cpu(log_new.new_map), 0, 0, 0);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static bool ent_is_padding(struct log_entry *ent)
{
	return (ent->lba == 0) && (ent->old_map == 0) && (ent->new_map == 0)
		&& (ent->seq == 0);
}

/*
 * Detecting valid log indices: We read a log group (see the comments in btt.h
 * for a description of a 'log_group' and its 'slots'), and iterate over its
 * four slots. We expect that a padding slot will be all-zeroes, and use this
 * to detect a padding slot vs. an actual entry.
 *
 * If a log_group is in the initial state, i.e. hasn't been used since the
 * creation of this BTT layout, it will have three of the four slots with
 * zeroes. We skip over these log_groups for the detection of log_index. If
 * all log_groups are in the initial state (i.e. the BTT has never been
 * written to), it is safe to assume the 'new format' of log entries in slots
 * (0, 1).
 */
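
/*
 * Layout being detected below, for reference: each lane owns one log group
 * of four LOG_ENT_SIZE slots. The current ("new") format keeps its two
 * entries in slots (0, 1) with (2, 3) as zero padding, while the older
 * format used slots (0, 2) with (1, 3) as padding:
 *
 *   slot:     0        1        2        3
 *   new:   [ent A]  [ent B]  [ pad ]  [ pad ]
 *   old:   [ent A]  [ pad ]  [ent B]  [ pad ]
 *
 * These are the only two permutations log_set_indices() will accept.
 */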

static int log_set_indices(struct arena_info *arena)
{
	bool idx_set = false, initial_state = true;
	int ret, log_index[2] = {-1, -1};
	u32 i, j, next_idx = 0;
	struct log_group log;
	u32 pad_count = 0;

	for (i = 0; i < arena->nfree; i++) {
		ret = btt_log_group_read(arena, i, &log);
		if (ret < 0)
			return ret;

		for (j = 0; j < 4; j++) {
			if (!idx_set) {
				if (ent_is_padding(&log.ent[j])) {
					pad_count++;
					continue;
				} else {
					/* Skip if index has been recorded */
					if ((next_idx == 1) &&
						(j == log_index[0]))
						continue;
					/* valid entry, record index */
					log_index[next_idx] = j;
					next_idx++;
				}
				if (next_idx == 2) {
					/* two valid entries found */
					idx_set = true;
				} else if (next_idx > 2) {
					/* too many valid indices */
					return -ENXIO;
				}
			} else {
				/*
				 * once the indices have been set, just verify
				 * that all subsequent log groups are either in
				 * their initial state or follow the same
				 * indices.
				 */
				if (j == log_index[0]) {
					/* entry must be 'valid' */
					if (ent_is_padding(&log.ent[j]))
						return -ENXIO;
				} else if (j == log_index[1]) {
					;
					/*
					 * log_index[1] can be padding if the
					 * lane never got used and it is still
					 * in the initial state (three 'padding'
					 * entries)
					 */
				} else {
					/* entry must be invalid (padding) */
					if (!ent_is_padding(&log.ent[j]))
						return -ENXIO;
				}
			}
		}
		/*
		 * If any of the log_groups have more than one valid,
		 * non-padding entry, then we are no longer in the
		 * initial_state
		 */
		if (pad_count < 3)
			initial_state = false;
		pad_count = 0;
	}

	if (!initial_state && !idx_set)
		return -ENXIO;

	/*
	 * If all the entries in the log were in the initial state,
	 * assume new padding scheme
	 */
	if (initial_state)
		log_index[1] = 1;

	/*
	 * Only allow the known permutations of log/padding indices,
	 * i.e. (0, 1), and (0, 2)
	 */
	if ((log_index[0] == 0) && ((log_index[1] == 1) || (log_index[1] == 2)))
		; /* known index possibilities */
	else {
		dev_err(to_dev(arena), "Found an unknown padding scheme\n");
		return -ENXIO;
	}

	arena->log_index[0] = log_index[0];
	arena->log_index[1] = log_index[1];
	dev_dbg(to_dev(arena), "log_index_0 = %d\n", log_index[0]);
	dev_dbg(to_dev(arena), "log_index_1 = %d\n", log_index[1]);
	return 0;
}

static int btt_rtt_init(struct arena_info *arena)
{
	arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL);
	if (arena->rtt == NULL)
		return -ENOMEM;

	return 0;
}

static int btt_maplocks_init(struct arena_info *arena)
{
	u32 i;

	arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock),
				GFP_KERNEL);
	if (!arena->map_locks)
		return -ENOMEM;

	for (i = 0; i < arena->nfree; i++)
		spin_lock_init(&arena->map_locks[i].lock);

	return 0;
}

static struct arena_info *alloc_arena(struct btt *btt, size_t size,
				size_t start, size_t arena_off)
{
	struct arena_info *arena;
	u64 logsize, mapsize, datasize;
	u64 available = size;

	arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL);
	if (!arena)
		return NULL;
	arena->nd_btt = btt->nd_btt;
	arena->sector_size = btt->sector_size;
	mutex_init(&arena->err_lock);

	if (!size)
		return arena;

	arena->size = size;
	arena->external_lba_start = start;
	arena->external_lbasize = btt->lbasize;
	arena->internal_lbasize = roundup(arena->external_lbasize,
					INT_LBASIZE_ALIGNMENT);
	arena->nfree = BTT_DEFAULT_NFREE;
	arena->version_major = btt->nd_btt->version_major;
	arena->version_minor = btt->nd_btt->version_minor;

	if (available % BTT_PG_SIZE)
		available -= (available % BTT_PG_SIZE);

	/* Two pages are reserved for the super block and its copy */
	available -= 2 * BTT_PG_SIZE;

	/* The log takes a fixed amount of space based on nfree */
	logsize = roundup(arena->nfree * LOG_GRP_SIZE, BTT_PG_SIZE);
	available -= logsize;

	/* Calculate optimal split between map and data area */
	arena->internal_nlba = div_u64(available - BTT_PG_SIZE,
			arena->internal_lbasize + MAP_ENT_SIZE);
	arena->external_nlba = arena->internal_nlba - arena->nfree;

	mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE);
	datasize = available - mapsize;

	/* 'Absolute' values, relative to start of storage space */
	arena->infooff = arena_off;
	arena->dataoff = arena->infooff + BTT_PG_SIZE;
	arena->mapoff = arena->dataoff + datasize;
	arena->logoff = arena->mapoff + mapsize;
	arena->info2off = arena->logoff + logsize;

	/* Default log indices are (0,1) */
	arena->log_index[0] = 0;
	arena->log_index[1] = 1;
	return arena;
}
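
/*
 * Resulting on-media layout of a non-zero-sized arena, using the offsets
 * computed in alloc_arena() above:
 *
 *   infooff   info block (one BTT_PG_SIZE page)
 *   dataoff   data area (internal blocks of internal_lbasize bytes each)
 *   mapoff    map, one MAP_ENT_SIZE entry per external block
 *   logoff    log, one LOG_GRP_SIZE group per free block (nfree total)
 *   info2off  backup copy of the info block
 *
 * The externally visible capacity is internal_nlba - nfree blocks; the
 * nfree extra internal blocks cycle through the free list.
 */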

static void free_arenas(struct btt *btt)
{
	struct arena_info *arena, *next;

	list_for_each_entry_safe(arena, next, &btt->arena_list, list) {
		list_del(&arena->list);
		kfree(arena->rtt);
		kfree(arena->map_locks);
		kfree(arena->freelist);
		debugfs_remove_recursive(arena->debugfs_dir);
		kfree(arena);
	}
}

/*
 * This function reads an existing valid btt superblock and
 * populates the corresponding arena_info struct
 */
static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super,
				u64 arena_off)
{
	arena->internal_nlba = le32_to_cpu(super->internal_nlba);
	arena->internal_lbasize = le32_to_cpu(super->internal_lbasize);
	arena->external_nlba = le32_to_cpu(super->external_nlba);
	arena->external_lbasize = le32_to_cpu(super->external_lbasize);
	arena->nfree = le32_to_cpu(super->nfree);
	arena->version_major = le16_to_cpu(super->version_major);
	arena->version_minor = le16_to_cpu(super->version_minor);

	arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off +
			le64_to_cpu(super->nextoff));
	arena->infooff = arena_off;
	arena->dataoff = arena_off + le64_to_cpu(super->dataoff);
	arena->mapoff = arena_off + le64_to_cpu(super->mapoff);
	arena->logoff = arena_off + le64_to_cpu(super->logoff);
	arena->info2off = arena_off + le64_to_cpu(super->info2off);

	arena->size = (le64_to_cpu(super->nextoff) > 0)
		? (le64_to_cpu(super->nextoff))
		: (arena->info2off - arena->infooff + BTT_PG_SIZE);

	arena->flags = le32_to_cpu(super->flags);
}
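
/*
 * The offsets stored in the info block are relative to the start of the
 * arena (btt_arena_write_layout() subtracts infooff before writing them
 * out), so parse_arena_meta() adds arena_off back in to produce absolute
 * namespace offsets. For example, the first arena has arena_off == 0, so
 * its on-media and in-memory offsets coincide; a second arena's offsets are
 * all shifted by the size of the first.
 */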

static int discover_arenas(struct btt *btt)
{
	int ret = 0;
	struct arena_info *arena;
	struct btt_sb *super;
	size_t remaining = btt->rawsize;
	u64 cur_nlba = 0;
	size_t cur_off = 0;
	int num_arenas = 0;

	super = kzalloc(sizeof(*super), GFP_KERNEL);
	if (!super)
		return -ENOMEM;

	while (remaining) {
		/* Alloc memory for arena */
		arena = alloc_arena(btt, 0, 0, 0);
		if (!arena) {
			ret = -ENOMEM;
			goto out_super;
		}

		arena->infooff = cur_off;
		ret = btt_info_read(arena, super);
		if (ret)
			goto out;

		if (!nd_btt_arena_is_valid(btt->nd_btt, super)) {
			if (remaining == btt->rawsize) {
				btt->init_state = INIT_NOTFOUND;
				dev_info(to_dev(arena), "No existing arenas\n");
				goto out;
			} else {
				dev_err(to_dev(arena),
					"Found corrupted metadata!\n");
				ret = -ENODEV;
				goto out;
			}
		}

		arena->external_lba_start = cur_nlba;
		parse_arena_meta(arena, super, cur_off);

		ret = log_set_indices(arena);
		if (ret) {
			dev_err(to_dev(arena),
				"Unable to deduce log/padding indices\n");
			goto out;
		}

		ret = btt_freelist_init(arena);
		if (ret)
			goto out;

		ret = btt_rtt_init(arena);
		if (ret)
			goto out;

		ret = btt_maplocks_init(arena);
		if (ret)
			goto out;

		list_add_tail(&arena->list, &btt->arena_list);

		remaining -= arena->size;
		cur_off += arena->size;
		cur_nlba += arena->external_nlba;
		num_arenas++;

		if (arena->nextoff == 0)
			break;
	}
	btt->num_arenas = num_arenas;
	btt->nlba = cur_nlba;
	btt->init_state = INIT_READY;

	kfree(super);
	return ret;

 out:
	kfree(arena);
	free_arenas(btt);
 out_super:
	kfree(super);
	return ret;
}

static int create_arenas(struct btt *btt)
{
	size_t remaining = btt->rawsize;
	size_t cur_off = 0;

	while (remaining) {
		struct arena_info *arena;
		size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining);

		remaining -= arena_size;
		if (arena_size < ARENA_MIN_SIZE)
			break;

		arena = alloc_arena(btt, arena_size, btt->nlba, cur_off);
		if (!arena) {
			free_arenas(btt);
			return -ENOMEM;
		}
		btt->nlba += arena->external_nlba;
		if (remaining >= ARENA_MIN_SIZE)
			arena->nextoff = arena->size;
		else
			arena->nextoff = 0;
		cur_off += arena_size;
		list_add_tail(&arena->list, &btt->arena_list);
	}

	return 0;
}
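
/*
 * Arena carving rule used above: the raw capacity is consumed in chunks of
 * at most ARENA_MAX_SIZE, and a final remainder smaller than ARENA_MIN_SIZE
 * is dropped rather than turned into a tiny arena. Every arena except the
 * last records the distance to the next one in nextoff, which is what
 * discover_arenas() follows on a subsequent attach. With the limits defined
 * in btt.h this means, for example, that a namespace slightly larger than
 * one maximum-sized arena ends up with two arenas, provided the remainder
 * clears the minimum size.
 */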

/*
 * This function completes arena initialization by writing
 * all the metadata.
 * It is only called for an uninitialized arena when a write
 * to that arena occurs for the first time.
 */
static int btt_arena_write_layout(struct arena_info *arena)
{
	int ret;
	u64 sum;
	struct btt_sb *super;
	struct nd_btt *nd_btt = arena->nd_btt;
	const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev);

	ret = btt_map_init(arena);
	if (ret)
		return ret;

	ret = btt_log_init(arena);
	if (ret)
		return ret;

	super = kzalloc(sizeof(struct btt_sb), GFP_NOIO);
	if (!super)
		return -ENOMEM;

	strncpy(super->signature, BTT_SIG, BTT_SIG_LEN);
	memcpy(super->uuid, nd_btt->uuid, 16);
	memcpy(super->parent_uuid, parent_uuid, 16);
	super->flags = cpu_to_le32(arena->flags);
	super->version_major = cpu_to_le16(arena->version_major);
	super->version_minor = cpu_to_le16(arena->version_minor);
	super->external_lbasize = cpu_to_le32(arena->external_lbasize);
	super->external_nlba = cpu_to_le32(arena->external_nlba);
	super->internal_lbasize = cpu_to_le32(arena->internal_lbasize);
	super->internal_nlba = cpu_to_le32(arena->internal_nlba);
	super->nfree = cpu_to_le32(arena->nfree);
	super->infosize = cpu_to_le32(sizeof(struct btt_sb));
	super->nextoff = cpu_to_le64(arena->nextoff);
	/*
	 * Subtract arena->infooff (arena start) so numbers are relative
	 * to 'this' arena
	 */
	super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff);
	super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff);
	super->logoff = cpu_to_le64(arena->logoff - arena->infooff);
	super->info2off = cpu_to_le64(arena->info2off - arena->infooff);

	super->flags = 0;
	sum = nd_sb_checksum((struct nd_gen_sb *) super);
	super->checksum = cpu_to_le64(sum);

	ret = btt_info_write(arena, super);

	kfree(super);
	return ret;
}

/*
 * This function completes the initialization for the BTT namespace
 * such that it is ready to accept IOs
 */
static int btt_meta_init(struct btt *btt)
{
	int ret = 0;
	struct arena_info *arena;

	mutex_lock(&btt->init_lock);
	list_for_each_entry(arena, &btt->arena_list, list) {
		ret = btt_arena_write_layout(arena);
		if (ret)
			goto unlock;

		ret = btt_freelist_init(arena);
		if (ret)
			goto unlock;

		ret = btt_rtt_init(arena);
		if (ret)
			goto unlock;

		ret = btt_maplocks_init(arena);
		if (ret)
			goto unlock;
	}

	btt->init_state = INIT_READY;

 unlock:
	mutex_unlock(&btt->init_lock);
	return ret;
}

static u32 btt_meta_size(struct btt *btt)
{
	return btt->lbasize - btt->sector_size;
}
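
/*
 * btt_meta_size() is the per-sector metadata carried beyond the advertised
 * sector size. As an illustration, an external lbasize of 520 bytes with a
 * 512B sector_size leaves 8 bytes per sector, which btt_rw_integrity()
 * below stores at to_namespace_offset(postmap) + sector_size, immediately
 * after the sector data. (The 520/8 split is only an example; any
 * lbasize > sector_size works the same way.)
 */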

/*
 * This function calculates the arena in which the given LBA lies
 * by doing a linear walk. This is acceptable since we expect only
 * a few arenas. If we have backing devices that get much larger,
 * we can construct a balanced binary tree of arenas at init time
 * so that this range search becomes faster.
 */
static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap,
				struct arena_info **arena)
{
	struct arena_info *arena_list;
	__u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size);

	list_for_each_entry(arena_list, &btt->arena_list, list) {
		if (lba < arena_list->external_nlba) {
			*arena = arena_list;
			*premap = lba;
			return 0;
		}
		lba -= arena_list->external_nlba;
	}

	return -EIO;
}

/*
 * The following (lock_map, unlock_map) are mostly just to improve
 * readability, since they index into an array of locks
 */
static void lock_map(struct arena_info *arena, u32 premap)
		__acquires(&arena->map_locks[idx].lock)
{
	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;

	spin_lock(&arena->map_locks[idx].lock);
}

static void unlock_map(struct arena_info *arena, u32 premap)
		__releases(&arena->map_locks[idx].lock)
{
	u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree;

	spin_unlock(&arena->map_locks[idx].lock);
}

static int btt_data_read(struct arena_info *arena, struct page *page,
			unsigned int off, u32 lba, u32 len)
{
	int ret;
	u64 nsoff = to_namespace_offset(arena, lba);
	void *mem = kmap_atomic(page);

	ret = arena_read_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
	kunmap_atomic(mem);

	return ret;
}

static int btt_data_write(struct arena_info *arena, u32 lba,
			struct page *page, unsigned int off, u32 len)
{
	int ret;
	u64 nsoff = to_namespace_offset(arena, lba);
	void *mem = kmap_atomic(page);

	ret = arena_write_bytes(arena, nsoff, mem + off, len, NVDIMM_IO_ATOMIC);
	kunmap_atomic(mem);

	return ret;
}

static void zero_fill_data(struct page *page, unsigned int off, u32 len)
{
	void *mem = kmap_atomic(page);

	memset(mem + off, 0, len);
	kunmap_atomic(mem);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
			struct arena_info *arena, u32 postmap, int rw)
{
	unsigned int len = btt_meta_size(btt);
	u64 meta_nsoff;
	int ret = 0;

	if (bip == NULL)
		return 0;

	meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size;

	while (len) {
		unsigned int cur_len;
		struct bio_vec bv;
		void *mem;

		bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
		/*
		 * The 'bv' obtained from bvec_iter_bvec has its .bv_len and
		 * .bv_offset already adjusted for iter->bi_bvec_done, and we
		 * can use those directly
		 */

		cur_len = min(len, bv.bv_len);
		mem = kmap_atomic(bv.bv_page);
		if (rw)
			ret = arena_write_bytes(arena, meta_nsoff,
					mem + bv.bv_offset, cur_len,
					NVDIMM_IO_ATOMIC);
		else
			ret = arena_read_bytes(arena, meta_nsoff,
					mem + bv.bv_offset, cur_len,
					NVDIMM_IO_ATOMIC);

		kunmap_atomic(mem);
		if (ret)
			return ret;

		len -= cur_len;
		meta_nsoff += cur_len;
		if (!bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len))
			return -EIO;
	}

	return ret;
}

#else /* CONFIG_BLK_DEV_INTEGRITY */
static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
			struct arena_info *arena, u32 postmap, int rw)
{
	return 0;
}
#endif
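
/*
 * Reader/writer handshake used by btt_read_pg() and btt_write_pg() below: a
 * reader first publishes RTT_VALID | postmap in its lane's RTT slot, then
 * re-reads the map entry to confirm the translation did not change
 * underneath it, and only then touches the data. A writer about to reuse a
 * free block spins until no RTT slot advertises that block, so a block is
 * never overwritten while a reader may still be fetching its old contents.
 */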

static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
			struct page *page, unsigned int off, sector_t sector,
			unsigned int len)
{
	int ret = 0;
	int t_flag, e_flag;
	struct arena_info *arena = NULL;
	u32 lane = 0, premap, postmap;

	while (len) {
		u32 cur_len;

		lane = nd_region_acquire_lane(btt->nd_region);

		ret = lba_to_arena(btt, sector, &premap, &arena);
		if (ret)
			goto out_lane;

		cur_len = min(btt->sector_size, len);

		ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag,
				NVDIMM_IO_ATOMIC);
		if (ret)
			goto out_lane;

		/*
		 * We loop to make sure that the post map LBA didn't change
		 * from under us between writing the RTT and doing the actual
		 * read.
		 */
		while (1) {
			u32 new_map;
			int new_t, new_e;

			if (t_flag) {
				zero_fill_data(page, off, cur_len);
				goto out_lane;
			}

			if (e_flag) {
				ret = -EIO;
				goto out_lane;
			}

			arena->rtt[lane] = RTT_VALID | postmap;
			/*
			 * Barrier to make sure this write is not reordered
			 * to do the verification map_read before the RTT store
			 */
			barrier();

			ret = btt_map_read(arena, premap, &new_map, &new_t,
					&new_e, NVDIMM_IO_ATOMIC);
			if (ret)
				goto out_rtt;

			if ((postmap == new_map) && (t_flag == new_t) &&
					(e_flag == new_e))
				break;

			postmap = new_map;
			t_flag = new_t;
			e_flag = new_e;
		}

		ret = btt_data_read(arena, page, off, postmap, cur_len);
		if (ret) {
			int rc;

			/* Media error - set the e_flag */
			rc = btt_map_write(arena, premap, postmap, 0, 1,
				NVDIMM_IO_ATOMIC);
			goto out_rtt;
		}

		if (bip) {
			ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
			if (ret)
				goto out_rtt;
		}

		arena->rtt[lane] = RTT_INVALID;
		nd_region_release_lane(btt->nd_region, lane);

		len -= cur_len;
		off += cur_len;
		sector += btt->sector_size >> SECTOR_SHIFT;
	}

	return 0;

 out_rtt:
	arena->rtt[lane] = RTT_INVALID;
 out_lane:
	nd_region_release_lane(btt->nd_region, lane);
	return ret;
}

/*
 * Normally, arena_{read,write}_bytes will take care of the initial offset
 * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
 * we need the final, raw namespace offset here
 */
static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
		u32 postmap)
{
	u64 nsoff = adjust_initial_offset(arena->nd_btt,
			to_namespace_offset(arena, postmap));
	sector_t phys_sector = nsoff >> 9;

	return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
}
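
/*
 * Ordering that gives btt_write_pg() below its single-sector atomicity: the
 * new data (and its integrity metadata) is written to the lane's free block
 * first, then the flog entry recording {premap, old postmap, new postmap}
 * is committed, and only then is the map entry flipped to the new block. A
 * crash before the flog commit leaves the old mapping fully intact; a crash
 * after it is repaired by btt_freelist_init(), which replays the map update
 * from the newest flog entry on the next attach.
 */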

static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
			sector_t sector, struct page *page, unsigned int off,
			unsigned int len)
{
	int ret = 0;
	struct arena_info *arena = NULL;
	u32 premap = 0, old_postmap, new_postmap, lane = 0, i;
	struct log_entry log;
	int sub;

	while (len) {
		u32 cur_len;
		int e_flag;

 retry:
		lane = nd_region_acquire_lane(btt->nd_region);

		ret = lba_to_arena(btt, sector, &premap, &arena);
		if (ret)
			goto out_lane;
		cur_len = min(btt->sector_size, len);

		if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) {
			ret = -EIO;
			goto out_lane;
		}

		if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
			arena->freelist[lane].has_err = 1;

		if (mutex_is_locked(&arena->err_lock)
				|| arena->freelist[lane].has_err) {
			nd_region_release_lane(btt->nd_region, lane);

			ret = arena_clear_freelist_error(arena, lane);
			if (ret)
				return ret;

			/* OK to acquire a different lane/free block */
			goto retry;
		}

		new_postmap = arena->freelist[lane].block;

		/* Wait if the new block is being read from */
		for (i = 0; i < arena->nfree; i++)
			while (arena->rtt[i] == (RTT_VALID | new_postmap))
				cpu_relax();


		if (new_postmap >= arena->internal_nlba) {
			ret = -EIO;
			goto out_lane;
		}

		ret = btt_data_write(arena, new_postmap, page, off, cur_len);
		if (ret)
			goto out_lane;

		if (bip) {
			ret = btt_rw_integrity(btt, bip, arena, new_postmap,
						WRITE);
			if (ret)
				goto out_lane;
		}

		lock_map(arena, premap);
		ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
				NVDIMM_IO_ATOMIC);
		if (ret)
			goto out_map;
		if (old_postmap >= arena->internal_nlba) {
			ret = -EIO;
			goto out_map;
		}
		if (e_flag)
			set_e_flag(old_postmap);

		log.lba = cpu_to_le32(premap);
		log.old_map = cpu_to_le32(old_postmap);
		log.new_map = cpu_to_le32(new_postmap);
		log.seq = cpu_to_le32(arena->freelist[lane].seq);
		sub = arena->freelist[lane].sub;
		ret = btt_flog_write(arena, lane, sub, &log);
		if (ret)
			goto out_map;

		ret = btt_map_write(arena, premap, new_postmap, 0, 0,
				NVDIMM_IO_ATOMIC);
		if (ret)
			goto out_map;

		unlock_map(arena, premap);
		nd_region_release_lane(btt->nd_region, lane);

		if (e_flag) {
			ret = arena_clear_freelist_error(arena, lane);
			if (ret)
				return ret;
		}

		len -= cur_len;
		off += cur_len;
		sector += btt->sector_size >> SECTOR_SHIFT;
	}

	return 0;

 out_map:
	unlock_map(arena, premap);
 out_lane:
	nd_region_release_lane(btt->nd_region, lane);
	return ret;
}

static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
			struct page *page, unsigned int len, unsigned int off,
			unsigned int op, sector_t sector)
{
	int ret;

	if (!op_is_write(op)) {
		ret = btt_read_pg(btt, bip, page, off, sector, len);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		ret = btt_write_pg(btt, bip, sector, page, off, len);
	}

	return ret;
}
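
/*
 * The BTT provides atomicity at sector_size granularity only, so the bio
 * front end below rejects any segment that is not a whole multiple of
 * sector_size or that spans more than a page; the read/write paths above
 * then process each accepted segment one sector at a time.
 */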

static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
{
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct btt *btt = q->queuedata;
	struct bvec_iter iter;
	unsigned long start;
	struct bio_vec bvec;
	int err = 0;
	bool do_acct;

	if (!bio_integrity_prep(bio))
		return BLK_QC_T_NONE;

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;

		if (len > PAGE_SIZE || len < btt->sector_size ||
				len % btt->sector_size) {
			dev_err_ratelimited(&btt->nd_btt->dev,
				"unaligned bio segment (len: %d)\n", len);
			bio->bi_status = BLK_STS_IOERR;
			break;
		}

		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
				bio_op(bio), iter.bi_sector);
		if (err) {
			dev_err(&btt->nd_btt->dev,
				"io error in %s sector %lld, len %d\n",
				(op_is_write(bio_op(bio))) ? "WRITE" :
				"READ",
				(unsigned long long) iter.bi_sector, len);
			bio->bi_status = errno_to_blk_status(err);
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int btt_rw_page(struct block_device *bdev, sector_t sector,
		struct page *page, unsigned int op)
{
	struct btt *btt = bdev->bd_disk->private_data;
	int rc;
	unsigned int len;

	len = hpage_nr_pages(page) * PAGE_SIZE;
	rc = btt_do_bvec(btt, NULL, page, len, 0, op, sector);
	if (rc == 0)
		page_endio(page, op_is_write(op), 0);

	return rc;
}


static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
	/* some standard values */
	geo->heads = 1 << 6;
	geo->sectors = 1 << 5;
	geo->cylinders = get_capacity(bd->bd_disk) >> 11;
	return 0;
}

static const struct block_device_operations btt_fops = {
	.owner = THIS_MODULE,
	.rw_page = btt_rw_page,
	.getgeo = btt_getgeo,
	.revalidate_disk = nvdimm_revalidate_disk,
};

static int btt_blk_init(struct btt *btt)
{
	struct nd_btt *nd_btt = btt->nd_btt;
	struct nd_namespace_common *ndns = nd_btt->ndns;

	/* create a new disk and request queue for btt */
	btt->btt_queue = blk_alloc_queue(GFP_KERNEL);
	if (!btt->btt_queue)
		return -ENOMEM;

	btt->btt_disk = alloc_disk(0);
	if (!btt->btt_disk) {
		blk_cleanup_queue(btt->btt_queue);
		return -ENOMEM;
	}

	nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
	btt->btt_disk->first_minor = 0;
	btt->btt_disk->fops = &btt_fops;
	btt->btt_disk->private_data = btt;
	btt->btt_disk->queue = btt->btt_queue;
	btt->btt_disk->flags = GENHD_FL_EXT_DEVT;
	btt->btt_disk->queue->backing_dev_info->capabilities |=
			BDI_CAP_SYNCHRONOUS_IO;

	blk_queue_make_request(btt->btt_queue, btt_make_request);
	blk_queue_logical_block_size(btt->btt_queue, btt->sector_size);
	blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX);
	blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue);
	btt->btt_queue->queuedata = btt;

	if (btt_meta_size(btt)) {
		int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));

		if (rc) {
			del_gendisk(btt->btt_disk);
			put_disk(btt->btt_disk);
			blk_cleanup_queue(btt->btt_queue);
			return rc;
		}
	}
	set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
	device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
	btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
	revalidate_disk(btt->btt_disk);

	return 0;
}

static void btt_blk_cleanup(struct btt *btt)
{
	del_gendisk(btt->btt_disk);
	put_disk(btt->btt_disk);
	blk_cleanup_queue(btt->btt_queue);
}

/**
 * btt_init - initialize a block translation table for the given device
 * @nd_btt: device with BTT geometry and backing device info
 * @rawsize: raw size in bytes of the backing device
 * @lbasize: lba size of the backing device
 * @uuid: A uuid for the backing device - this is stored on media
 * @nd_region: parent region that provides the lanes used to parallelize
 *		requests
 *
 * Initialize a Block Translation Table on a backing device to provide
 * single sector power fail atomicity.
 *
 * Context:
 * Might sleep.
 *
 * Returns:
 * Pointer to a new struct btt on success, NULL on failure.
 */
static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
		u32 lbasize, u8 *uuid, struct nd_region *nd_region)
{
	int ret;
	struct btt *btt;
	struct nd_namespace_io *nsio;
	struct device *dev = &nd_btt->dev;

	btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
	if (!btt)
		return NULL;

	btt->nd_btt = nd_btt;
	btt->rawsize = rawsize;
	btt->lbasize = lbasize;
	btt->sector_size = ((lbasize >= 4096) ? 4096 : 512);
	INIT_LIST_HEAD(&btt->arena_list);
	mutex_init(&btt->init_lock);
	btt->nd_region = nd_region;
	nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
	btt->phys_bb = &nsio->bb;

	ret = discover_arenas(btt);
	if (ret) {
		dev_err(dev, "init: error in arena_discover: %d\n", ret);
		return NULL;
	}

	if (btt->init_state != INIT_READY && nd_region->ro) {
		dev_warn(dev, "%s is read-only, unable to init btt metadata\n",
				dev_name(&nd_region->dev));
		return NULL;
	} else if (btt->init_state != INIT_READY) {
		btt->num_arenas = (rawsize / ARENA_MAX_SIZE) +
			((rawsize % ARENA_MAX_SIZE) ? 1 : 0);
		dev_dbg(dev, "init: %d arenas for %llu rawsize\n",
				btt->num_arenas, rawsize);

		ret = create_arenas(btt);
		if (ret) {
			dev_info(dev, "init: create_arenas: %d\n", ret);
			return NULL;
		}

		ret = btt_meta_init(btt);
		if (ret) {
			dev_err(dev, "init: error in meta_init: %d\n", ret);
			return NULL;
		}
	}

	ret = btt_blk_init(btt);
	if (ret) {
		dev_err(dev, "init: error in blk_init: %d\n", ret);
		return NULL;
	}

	btt_debugfs_init(btt);

	return btt;
}

/**
 * btt_fini - de-initialize a BTT
 * @btt: the BTT handle that was generated by btt_init
 *
 * De-initialize a Block Translation Table on device removal
 *
 * Context:
 * Might sleep.
 */
static void btt_fini(struct btt *btt)
{
	if (btt) {
		btt_blk_cleanup(btt);
		free_arenas(btt);
		debugfs_remove_recursive(btt->debugfs_dir);
	}
}

int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
{
	struct nd_btt *nd_btt = to_nd_btt(ndns->claim);
	struct nd_region *nd_region;
	struct btt_sb *btt_sb;
	struct btt *btt;
	size_t rawsize;

	if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) {
		dev_dbg(&nd_btt->dev, "incomplete btt configuration\n");
		return -ENODEV;
	}

	btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
	if (!btt_sb)
		return -ENOMEM;

	/*
	 * If this returns < 0, that is ok as it just means there wasn't
	 * an existing BTT, and we're creating a new one. We still need to
	 * call this as we need the version dependent fields in nd_btt to be
	 * set correctly based on the holder class
	 */
	nd_btt_version(nd_btt, ndns, btt_sb);

	rawsize = nvdimm_namespace_capacity(ndns) - nd_btt->initial_offset;
	if (rawsize < ARENA_MIN_SIZE) {
		dev_dbg(&nd_btt->dev, "%s must be at least %ld bytes\n",
				dev_name(&ndns->dev),
				ARENA_MIN_SIZE + nd_btt->initial_offset);
		return -ENXIO;
	}
	nd_region = to_nd_region(nd_btt->dev.parent);
	btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid,
			nd_region);
	if (!btt)
		return -ENOMEM;
	nd_btt->btt = btt;

	return 0;
}
EXPORT_SYMBOL(nvdimm_namespace_attach_btt);

int nvdimm_namespace_detach_btt(struct nd_btt *nd_btt)
{
	struct btt *btt = nd_btt->btt;

	btt_fini(btt);
	nd_btt->btt = NULL;

	return 0;
}
EXPORT_SYMBOL(nvdimm_namespace_detach_btt);

static int __init nd_btt_init(void)
{
	int rc = 0;

	debugfs_root = debugfs_create_dir("btt", NULL);
	if (IS_ERR_OR_NULL(debugfs_root))
		rc = -ENXIO;

	return rc;
}

static void __exit nd_btt_exit(void)
{
	debugfs_remove_recursive(debugfs_root);
}

MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT);
MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>");
MODULE_LICENSE("GPL v2");
module_init(nd_btt_init);
module_exit(nd_btt_exit);