/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pfn.h"
#include "nd.h"

struct pmem_device {
	struct request_queue	*pmem_queue;
	struct gendisk		*pmem_disk;
	struct nd_namespace_common *ndns;

	/* One contiguous memory region per device */
	phys_addr_t		phys_addr;
	/* when non-zero this device is hosting a 'pfn' instance */
	phys_addr_t		data_offset;
	u64			pfn_flags;
	void __pmem		*virt_addr;
	/* immutable base size of the namespace */
	size_t			size;
	/* trim size when namespace capacity has been section aligned */
	u32			pfn_pad;
	struct badblocks	bb;
};

static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
	if (bb->count) {
		sector_t first_bad;
		int num_bad;

		return !!badblocks_check(bb, sector, len / 512, &first_bad,
				&num_bad);
	}

	return false;
}

static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = disk_to_dev(pmem->pmem_disk);
	sector_t sector;
	long cleared;

	sector = (offset - pmem->data_offset) / 512;
	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);

	if (cleared > 0 && cleared / 512) {
		dev_dbg(dev, "%s: %llx clear %ld sector%s\n",
				__func__, (unsigned long long) sector,
				cleared / 512, cleared / 512 > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared / 512);
	}
	invalidate_pmem(pmem->virt_addr + offset, len);
}

static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (rw == READ) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
		if (unlikely(bad_pmem)) {
			pmem_clear_poison(pmem, pmem_off, len);
			memcpy_to_pmem(pmem_addr, mem + off, len);
		}
	}

	kunmap_atomic(mem);
	return rc;
}

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct block_device *bdev = bio->bi_bdev;
	struct pmem_device *pmem = bdev->bd_disk->private_data;

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, bio_data_dir(bio),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio_data_dir(bio))
		wmb_pmem();

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
	if (rw & WRITE)
		wmb_pmem();

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, rw & WRITE, 0);

	return rc;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
		      void __pmem **kaddr, pfn_t *pfn)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	return pmem->size - pmem->pfn_pad - offset;
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
	.revalidate_disk =	nvdimm_revalidate_disk,
};

static struct pmem_device *pmem_alloc(struct device *dev,
		struct resource *res, int id)
{
	struct pmem_device *pmem;
	struct request_queue *q;

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return ERR_PTR(-ENOMEM);

	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (!arch_has_wmb_pmem())
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
			dev_name(dev))) {
		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
				&pmem->phys_addr, pmem->size);
		return ERR_PTR(-EBUSY);
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return ERR_PTR(-ENOMEM);

	pmem->pfn_flags = PFN_DEV;
	if (pmem_should_map_pages(dev)) {
		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
				pmem->phys_addr, pmem->size,
				ARCH_MEMREMAP_PMEM);

	if (IS_ERR(pmem->virt_addr)) {
		blk_cleanup_queue(q);
		return (void __force *) pmem->virt_addr;
	}

	pmem->pmem_queue = q;
	return pmem;
}

static void pmem_detach_disk(struct pmem_device *pmem)
{
	if (!pmem->pmem_disk)
		return;

	del_gendisk(pmem->pmem_disk);
	put_disk(pmem->pmem_disk);
	blk_cleanup_queue(pmem->pmem_queue);
}

static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns, struct pmem_device *pmem)
{
	int nid = dev_to_node(dev);
	struct gendisk *disk;

	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);

	disk = alloc_disk_node(0, nid);
	if (!disk) {
		blk_cleanup_queue(pmem->pmem_queue);
		return -ENOMEM;
	}

	disk->fops		= &pmem_fops;
	disk->private_data	= pmem;
	disk->queue		= pmem->pmem_queue;
	disk->flags		= GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	disk->driverfs_dev = dev;
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	pmem->pmem_disk = disk;
	devm_exit_badblocks(dev, &pmem->bb);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);

	disk->bb = &pmem->bb;
	add_disk(disk);
	revalidate_disk(disk);

	return 0;
}

static int pmem_rw_bytes(struct nd_namespace_common *ndns,
		resource_size_t offset, void *buf, size_t size, int rw)
{
	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);

	if (unlikely(offset + size > pmem->size)) {
		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
		return -EFAULT;
	}

	if (rw == READ) {
		unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);

		if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
			return -EIO;
		memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
	} else {
		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
		wmb_pmem();
	}

	return 0;
}

static int nd_pfn_init(struct nd_pfn *nd_pfn)
{
	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
	struct nd_namespace_common *ndns = nd_pfn->ndns;
	u32 start_pad = 0, end_trunc = 0;
	resource_size_t start, size;
	struct nd_namespace_io *nsio;
	struct nd_region *nd_region;
	unsigned long npfns;
	phys_addr_t offset;
	u64 checksum;
	int rc;

	if (!pfn_sb)
		return -ENOMEM;

	nd_pfn->pfn_sb = pfn_sb;
	rc = nd_pfn_validate(nd_pfn);
	if (rc == -ENODEV)
		/* no info block, do init */;
	else
		return rc;

	nd_region = to_nd_region(nd_pfn->dev.parent);
	if (nd_region->ro) {
		dev_info(&nd_pfn->dev,
				"%s is read-only, unable to init metadata\n",
				dev_name(&nd_region->dev));
		goto err;
	}

	memset(pfn_sb, 0, sizeof(*pfn_sb));

	/*
	 * Check if pmem collides with 'System RAM' when section aligned and
	 * trim it accordingly
	 */
	nsio = to_nd_namespace_io(&ndns->dev);
	start = PHYS_SECTION_ALIGN_DOWN(nsio->res.start);
	size = resource_size(&nsio->res);
	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
				IORES_DESC_NONE) == REGION_MIXED) {

		start = nsio->res.start;
		start_pad = PHYS_SECTION_ALIGN_UP(start) - start;
	}

	start = nsio->res.start;
	size = PHYS_SECTION_ALIGN_UP(start + size) - start;
	if (region_intersects(start, size, IORESOURCE_SYSTEM_RAM,
				IORES_DESC_NONE) == REGION_MIXED) {
		size = resource_size(&nsio->res);
		end_trunc = start + size - PHYS_SECTION_ALIGN_DOWN(start + size);
	}

	if (start_pad + end_trunc)
		dev_info(&nd_pfn->dev, "%s section collision, truncate %d bytes\n",
				dev_name(&ndns->dev), start_pad + end_trunc);

	/*
	 * Note, we use 64 here for the standard size of struct page,
	 * debugging options may cause it to be larger in which case the
	 * implementation will limit the pfns advertised through
	 * ->direct_access() to those that are included in the memmap.
	 */
	start += start_pad;
	npfns = (pmem->size - start_pad - end_trunc - SZ_8K) / SZ_4K;
	if (nd_pfn->mode == PFN_MODE_PMEM)
		offset = ALIGN(start + SZ_8K + 64 * npfns, nd_pfn->align)
			- start;
	else if (nd_pfn->mode == PFN_MODE_RAM)
		offset = ALIGN(start + SZ_8K, nd_pfn->align) - start;
	else
		goto err;

	if (offset + start_pad + end_trunc >= pmem->size) {
		dev_err(&nd_pfn->dev, "%s unable to satisfy requested alignment\n",
				dev_name(&ndns->dev));
		goto err;
	}

	npfns = (pmem->size - offset - start_pad - end_trunc) / SZ_4K;
	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
	pfn_sb->dataoff = cpu_to_le64(offset);
	pfn_sb->npfns = cpu_to_le64(npfns);
	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
	pfn_sb->version_major = cpu_to_le16(1);
	pfn_sb->version_minor = cpu_to_le16(1);
	pfn_sb->start_pad = cpu_to_le32(start_pad);
	pfn_sb->end_trunc = cpu_to_le32(end_trunc);
	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
	pfn_sb->checksum = cpu_to_le64(checksum);

	rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
	if (rc)
		goto err;

	return 0;
 err:
	nd_pfn->pfn_sb = NULL;
	kfree(pfn_sb);
	return -ENXIO;
}

static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	struct pmem_device *pmem;

	/* free pmem disk */
	pmem = dev_get_drvdata(&nd_pfn->dev);
	pmem_detach_disk(pmem);

	/* release nd_pfn resources */
	kfree(nd_pfn->pfn_sb);
	nd_pfn->pfn_sb = NULL;

	return 0;
}

/*
 * We hotplug memory at section granularity, pad the reserved area from
 * the previous section base to the namespace base address.
 */
static unsigned long init_altmap_base(resource_size_t base)
{
	unsigned long base_pfn = PHYS_PFN(base);

	return PFN_SECTION_ALIGN_DOWN(base_pfn);
}

static unsigned long init_altmap_reserve(resource_size_t base)
{
	unsigned long reserve = PHYS_PFN(SZ_8K);
	unsigned long base_pfn = PHYS_PFN(base);

	reserve += base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);
	return reserve;
}

static int __nvdimm_namespace_attach_pfn(struct nd_pfn *nd_pfn)
{
	int rc;
	struct resource res;
	struct request_queue *q;
	struct pmem_device *pmem;
	struct vmem_altmap *altmap;
	struct device *dev = &nd_pfn->dev;
	struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;
	struct nd_namespace_common *ndns = nd_pfn->ndns;
	u32 start_pad = __le32_to_cpu(pfn_sb->start_pad);
	u32 end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	resource_size_t base = nsio->res.start + start_pad;
	struct vmem_altmap __altmap = {
		.base_pfn = init_altmap_base(base),
		.reserve = init_altmap_reserve(base),
	};

	pmem = dev_get_drvdata(dev);
	pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
	pmem->pfn_pad = start_pad + end_trunc;
	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
	if (nd_pfn->mode == PFN_MODE_RAM) {
		if (pmem->data_offset < SZ_8K)
			return -EINVAL;
		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
		altmap = NULL;
	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
		nd_pfn->npfns = (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ PAGE_SIZE;
		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
			dev_info(&nd_pfn->dev,
					"number of pfns truncated from %lld to %ld\n",
					le64_to_cpu(nd_pfn->pfn_sb->npfns),
					nd_pfn->npfns);
		altmap = &__altmap;
		altmap->free = PHYS_PFN(pmem->data_offset - SZ_8K);
		altmap->alloc = 0;
	} else {
		rc = -ENXIO;
		goto err;
	}

	/* establish pfn range for lookup, and switch to direct map */
	q = pmem->pmem_queue;
	memcpy(&res, &nsio->res, sizeof(res));
	res.start += start_pad;
	res.end -= end_trunc;
	devm_memunmap(dev, (void __force *) pmem->virt_addr);
	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &res,
			&q->q_usage_counter, altmap);
	pmem->pfn_flags |= PFN_MAP;
	if (IS_ERR(pmem->virt_addr)) {
		rc = PTR_ERR(pmem->virt_addr);
		goto err;
	}

	/* attach pmem disk in "pfn-mode" */
	rc = pmem_attach_disk(dev, ndns, pmem);
	if (rc)
		goto err;

	return rc;
 err:
	nvdimm_namespace_detach_pfn(ndns);
	return rc;
}

static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	int rc;

	if (!nd_pfn->uuid || !nd_pfn->ndns)
		return -ENODEV;

	rc = nd_pfn_init(nd_pfn);
	if (rc)
		return rc;
	/* we need a valid pfn_sb before we can init a vmem_altmap */
	return __nvdimm_namespace_attach_pfn(nd_pfn);
}

static int nd_pmem_probe(struct device *dev)
{
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct pmem_device *pmem;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	nsio = to_nd_namespace_io(&ndns->dev);
	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
	if (IS_ERR(pmem))
		return PTR_ERR(pmem);

	pmem->ndns = ndns;
	dev_set_drvdata(dev, pmem);
	ndns->rw_bytes = pmem_rw_bytes;
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);

	if (is_nd_btt(dev)) {
		/* btt allocates its own request_queue */
		blk_cleanup_queue(pmem->pmem_queue);
		pmem->pmem_queue = NULL;
		return nvdimm_namespace_attach_btt(ndns);
	}

	if (is_nd_pfn(dev))
		return nvdimm_namespace_attach_pfn(ndns);

	if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
		/*
		 * We'll come back as either btt-pmem, or pfn-pmem, so
		 * drop the queue allocation for now.
		 */
		blk_cleanup_queue(pmem->pmem_queue);
		return -ENXIO;
	}

	return pmem_attach_disk(dev, ndns, pmem);
}

static int nd_pmem_remove(struct device *dev)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);

	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(pmem->ndns);
	else if (is_nd_pfn(dev))
		nvdimm_namespace_detach_pfn(pmem->ndns);
	else
		pmem_detach_disk(pmem);

	return 0;
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);
	struct nd_namespace_common *ndns = pmem->ndns;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev))
		nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
	else
		nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");