/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pfn.h"
#include "nd.h"

struct pmem_device {
        struct request_queue *pmem_queue;
        struct gendisk *pmem_disk;
        struct nd_namespace_common *ndns;

        /* One contiguous memory region per device */
        phys_addr_t phys_addr;
        /* when non-zero this device is hosting a 'pfn' instance */
        phys_addr_t data_offset;
        unsigned long pfn_flags;
        void __pmem *virt_addr;
        size_t size;
        struct badblocks bb;
};

static int pmem_major;

static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
        if (bb->count) {
                sector_t first_bad;
                int num_bad;

                return !!badblocks_check(bb, sector, len / 512, &first_bad,
                                &num_bad);
        }

        return false;
}

static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
                        unsigned int len, unsigned int off, int rw,
                        sector_t sector)
{
        int rc = 0;
        void *mem = kmap_atomic(page);
        phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
        void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

        if (rw == READ) {
                if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
                        rc = -EIO;
                else {
                        memcpy_from_pmem(mem + off, pmem_addr, len);
                        flush_dcache_page(page);
                }
        } else {
                flush_dcache_page(page);
                memcpy_to_pmem(pmem_addr, mem + off, len);
        }

        kunmap_atomic(mem);
        return rc;
}

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
        int rc = 0;
        bool do_acct;
        unsigned long start;
        struct bio_vec bvec;
        struct bvec_iter iter;
        struct block_device *bdev = bio->bi_bdev;
        struct pmem_device *pmem = bdev->bd_disk->private_data;

        do_acct = nd_iostat_start(bio, &start);
        bio_for_each_segment(bvec, bio, iter) {
                rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
                                bvec.bv_offset, bio_data_dir(bio),
                                iter.bi_sector);
                if (rc) {
                        bio->bi_error = rc;
                        break;
                }
        }
        if (do_acct)
                nd_iostat_end(bio, start);

        if (bio_data_dir(bio))
                wmb_pmem();

        bio_endio(bio);
        return BLK_QC_T_NONE;
}
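/*
 * Note that the bio path above is fully synchronous: each segment is
 * copied to/from the direct mapping with memcpy_{from,to}_pmem() in
 * the submitter's context, a single wmb_pmem() fences all of a bio's
 * writes, and bio_endio() has already run by the time
 * pmem_make_request() returns.  No request is ever queued, so there
 * is no I/O scheduler involvement, and BLK_QC_T_NONE opts out of
 * blk-mq polling.
 */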
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
                        struct page *page, int rw)
{
        struct pmem_device *pmem = bdev->bd_disk->private_data;
        int rc;

        rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
        if (rw & WRITE)
                wmb_pmem();

        /*
         * The ->rw_page interface is subtle and tricky.  The core
         * retries on any error, so we can only invoke page_endio() in
         * the successful completion case.  Otherwise, we'll see crashes
         * caused by double completion.
         */
        if (rc == 0)
                page_endio(page, rw & WRITE, 0);

        return rc;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
                        void __pmem **kaddr, pfn_t *pfn)
{
        struct pmem_device *pmem = bdev->bd_disk->private_data;
        resource_size_t offset = sector * 512 + pmem->data_offset;

        *kaddr = pmem->virt_addr + offset;
        *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

        return pmem->size - offset;
}

static const struct block_device_operations pmem_fops = {
        .owner = THIS_MODULE,
        .rw_page = pmem_rw_page,
        .direct_access = pmem_direct_access,
        .revalidate_disk = nvdimm_revalidate_disk,
};

static struct pmem_device *pmem_alloc(struct device *dev,
                struct resource *res, int id)
{
        struct pmem_device *pmem;
        struct request_queue *q;

        pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
        if (!pmem)
                return ERR_PTR(-ENOMEM);

        pmem->phys_addr = res->start;
        pmem->size = resource_size(res);
        if (!arch_has_wmb_pmem())
                dev_warn(dev, "unable to guarantee persistence of writes\n");

        if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
                        dev_name(dev))) {
                dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
                                &pmem->phys_addr, pmem->size);
                return ERR_PTR(-EBUSY);
        }

        q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
        if (!q)
                return ERR_PTR(-ENOMEM);

        pmem->pfn_flags = PFN_DEV;
        if (pmem_should_map_pages(dev)) {
                pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
                                &q->q_usage_counter, NULL);
                pmem->pfn_flags |= PFN_MAP;
        } else
                pmem->virt_addr = (void __pmem *) devm_memremap(dev,
                                pmem->phys_addr, pmem->size,
                                ARCH_MEMREMAP_PMEM);

        if (IS_ERR(pmem->virt_addr)) {
                blk_cleanup_queue(q);
                return (void __force *) pmem->virt_addr;
        }

        pmem->pmem_queue = q;
        return pmem;
}

static void pmem_detach_disk(struct pmem_device *pmem)
{
        if (!pmem->pmem_disk)
                return;

        del_gendisk(pmem->pmem_disk);
        put_disk(pmem->pmem_disk);
        blk_cleanup_queue(pmem->pmem_queue);
}
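/*
 * Sketch of how a DAX consumer reaches pmem_direct_access() above,
 * assuming the v4.5-era bdev_direct_access()/struct blk_dax_ctl
 * interface (illustrative only, not part of this driver):
 *
 *	struct blk_dax_ctl dax = {
 *		.sector = sector,
 *		.size = PAGE_SIZE,
 *	};
 *	long avail = bdev_direct_access(bdev, &dax);
 *
 *	if (avail < 0)
 *		return avail;
 *	// dax.addr is now a kernel virtual address into persistent
 *	// memory, and dax.pfn is suitable for vm_insert_mixed()
 *
 * Per the ->direct_access() contract, the return value is the number
 * of addressable bytes remaining past 'sector'.
 */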
static int pmem_attach_disk(struct device *dev,
                struct nd_namespace_common *ndns, struct pmem_device *pmem)
{
        int nid = dev_to_node(dev);
        struct gendisk *disk;

        blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
        blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
        blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
        blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
        queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);

        disk = alloc_disk_node(0, nid);
        if (!disk) {
                blk_cleanup_queue(pmem->pmem_queue);
                return -ENOMEM;
        }

        disk->major = pmem_major;
        disk->first_minor = 0;
        disk->fops = &pmem_fops;
        disk->private_data = pmem;
        disk->queue = pmem->pmem_queue;
        disk->flags = GENHD_FL_EXT_DEVT;
        nvdimm_namespace_disk_name(ndns, disk->disk_name);
        disk->driverfs_dev = dev;
        set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
        pmem->pmem_disk = disk;
        devm_exit_badblocks(dev, &pmem->bb);
        if (devm_init_badblocks(dev, &pmem->bb))
                return -ENOMEM;
        nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);

        disk->bb = &pmem->bb;
        add_disk(disk);
        revalidate_disk(disk);

        return 0;
}

static int pmem_rw_bytes(struct nd_namespace_common *ndns,
                resource_size_t offset, void *buf, size_t size, int rw)
{
        struct pmem_device *pmem = dev_get_drvdata(ndns->claim);

        if (unlikely(offset + size > pmem->size)) {
                dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
                return -EFAULT;
        }

        if (rw == READ) {
                unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);

                if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
                        return -EIO;
                memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
        } else {
                memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
                wmb_pmem();
        }

        return 0;
}
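/*
 * On-media layout created by nd_pfn_init() below (offsets are from
 * the start of the namespace):
 *
 *	0        4K        8K                  dataoff
 *	+--------+---------+-------------------+------------------+
 *	| resv   | pfn_sb  | memmap (only for  | data, exported   |
 *	|        |         | PFN_MODE_PMEM)    | as the disk      |
 *	+--------+---------+-------------------+------------------+
 *
 * The info block (struct nd_pfn_sb) is written at offset 4K, the
 * first 8K are reserved via the vmem_altmap, and 'dataoff' is 8K
 * (plus 64 bytes of struct page per 4K pfn when the memmap lives in
 * pmem) aligned up to nd_pfn->align.
 */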
static int nd_pfn_init(struct nd_pfn *nd_pfn)
{
        struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
        struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
        struct nd_namespace_common *ndns = nd_pfn->ndns;
        struct nd_region *nd_region;
        unsigned long npfns;
        phys_addr_t offset;
        u64 checksum;
        int rc;

        if (!pfn_sb)
                return -ENOMEM;

        nd_pfn->pfn_sb = pfn_sb;
        rc = nd_pfn_validate(nd_pfn);
        if (rc == -ENODEV)
                /* no info block, do init */;
        else
                return rc;

        nd_region = to_nd_region(nd_pfn->dev.parent);
        if (nd_region->ro) {
                dev_info(&nd_pfn->dev,
                                "%s is read-only, unable to init metadata\n",
                                dev_name(&nd_region->dev));
                goto err;
        }

        memset(pfn_sb, 0, sizeof(*pfn_sb));
        npfns = (pmem->size - SZ_8K) / SZ_4K;
        /*
         * Note, we use 64 here for the standard size of struct page;
         * debugging options may cause it to be larger, in which case
         * the implementation will limit the pfns advertised through
         * ->direct_access() to those that are included in the memmap.
         */
        if (nd_pfn->mode == PFN_MODE_PMEM)
                offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
        else if (nd_pfn->mode == PFN_MODE_RAM)
                offset = ALIGN(SZ_8K, nd_pfn->align);
        else
                goto err;

        npfns = (pmem->size - offset) / SZ_4K;
        pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
        pfn_sb->dataoff = cpu_to_le64(offset);
        pfn_sb->npfns = cpu_to_le64(npfns);
        memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
        memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
        memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
        pfn_sb->version_major = cpu_to_le16(1);
        checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
        pfn_sb->checksum = cpu_to_le64(checksum);

        rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
        if (rc)
                goto err;

        return 0;
 err:
        nd_pfn->pfn_sb = NULL;
        kfree(pfn_sb);
        return -ENXIO;
}

static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
{
        struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
        struct pmem_device *pmem;

        /* free pmem disk */
        pmem = dev_get_drvdata(&nd_pfn->dev);
        pmem_detach_disk(pmem);

        /* release nd_pfn resources */
        kfree(nd_pfn->pfn_sb);
        nd_pfn->pfn_sb = NULL;

        return 0;
}

static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
{
        struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
        struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
        struct device *dev = &nd_pfn->dev;
        struct nd_region *nd_region;
        struct vmem_altmap *altmap;
        struct nd_pfn_sb *pfn_sb;
        struct pmem_device *pmem;
        struct request_queue *q;
        phys_addr_t offset;
        int rc;
        struct vmem_altmap __altmap = {
                .base_pfn = __phys_to_pfn(nsio->res.start),
                .reserve = __phys_to_pfn(SZ_8K),
        };

        if (!nd_pfn->uuid || !nd_pfn->ndns)
                return -ENODEV;

        nd_region = to_nd_region(dev->parent);
        rc = nd_pfn_init(nd_pfn);
        if (rc)
                return rc;

        pfn_sb = nd_pfn->pfn_sb;
        offset = le64_to_cpu(pfn_sb->dataoff);
        nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
        if (nd_pfn->mode == PFN_MODE_RAM) {
                if (offset < SZ_8K)
                        return -EINVAL;
                nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
                altmap = NULL;
        } else if (nd_pfn->mode == PFN_MODE_PMEM) {
                nd_pfn->npfns = (resource_size(&nsio->res) - offset)
                        / PAGE_SIZE;
                if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
                        dev_info(&nd_pfn->dev,
                                        "number of pfns truncated from %lld to %ld\n",
                                        le64_to_cpu(nd_pfn->pfn_sb->npfns),
                                        nd_pfn->npfns);
                altmap = &__altmap;
                altmap->free = __phys_to_pfn(offset - SZ_8K);
                altmap->alloc = 0;
        } else {
                rc = -ENXIO;
                goto err;
        }

        /* establish pfn range for lookup, and switch to direct map */
        pmem = dev_get_drvdata(dev);
        q = pmem->pmem_queue;
        devm_memunmap(dev, (void __force *) pmem->virt_addr);
        pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
                        &q->q_usage_counter, altmap);
        pmem->pfn_flags |= PFN_MAP;
        if (IS_ERR(pmem->virt_addr)) {
                rc = PTR_ERR(pmem->virt_addr);
                goto err;
        }

        /* attach pmem disk in "pfn-mode" */
        pmem->data_offset = offset;
        rc = pmem_attach_disk(dev, ndns, pmem);
        if (rc)
                goto err;

        return rc;
 err:
        nvdimm_namespace_detach_pfn(ndns);
        return rc;
}
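/*
 * Probe dispatch summary: nd_pmem_probe() below binds in one of three
 * personalities.  If the namespace is already claimed by a btt or pfn
 * device we attach through that claim; otherwise nd_btt_probe() and
 * nd_pfn_probe() get a chance to discover existing metadata (which
 * re-triggers probe as btt-pmem or pfn-pmem), and only if neither
 * claims the namespace do we attach the raw pmem disk.
 */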
static int nd_pmem_probe(struct device *dev)
{
        struct nd_region *nd_region = to_nd_region(dev->parent);
        struct nd_namespace_common *ndns;
        struct nd_namespace_io *nsio;
        struct pmem_device *pmem;

        ndns = nvdimm_namespace_common_probe(dev);
        if (IS_ERR(ndns))
                return PTR_ERR(ndns);

        nsio = to_nd_namespace_io(&ndns->dev);
        pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
        if (IS_ERR(pmem))
                return PTR_ERR(pmem);

        pmem->ndns = ndns;
        dev_set_drvdata(dev, pmem);
        ndns->rw_bytes = pmem_rw_bytes;
        if (devm_init_badblocks(dev, &pmem->bb))
                return -ENOMEM;
        nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);

        if (is_nd_btt(dev)) {
                /* btt allocates its own request_queue */
                blk_cleanup_queue(pmem->pmem_queue);
                pmem->pmem_queue = NULL;
                return nvdimm_namespace_attach_btt(ndns);
        }

        if (is_nd_pfn(dev))
                return nvdimm_namespace_attach_pfn(ndns);

        if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
                /*
                 * We'll come back as either btt-pmem, or pfn-pmem, so
                 * drop the queue allocation for now.
                 */
                blk_cleanup_queue(pmem->pmem_queue);
                return -ENXIO;
        }

        return pmem_attach_disk(dev, ndns, pmem);
}

static int nd_pmem_remove(struct device *dev)
{
        struct pmem_device *pmem = dev_get_drvdata(dev);

        if (is_nd_btt(dev))
                nvdimm_namespace_detach_btt(pmem->ndns);
        else if (is_nd_pfn(dev))
                nvdimm_namespace_detach_pfn(pmem->ndns);
        else
                pmem_detach_disk(pmem);

        return 0;
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
        .probe = nd_pmem_probe,
        .remove = nd_pmem_remove,
        .drv = {
                .name = "nd_pmem",
        },
        .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
        int error;

        pmem_major = register_blkdev(0, "pmem");
        if (pmem_major < 0)
                return pmem_major;

        error = nd_driver_register(&nd_pmem_driver);
        if (error) {
                unregister_blkdev(pmem_major, "pmem");
                return error;
        }

        return 0;
}
module_init(pmem_init);

static void pmem_exit(void)
{
        driver_unregister(&nd_pmem_driver.drv);
        unregister_blkdev(pmem_major, "pmem");
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");