/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
	/*
	 * nvdimm bus services need a 'dev' parameter, and we record the device
	 * at init in bb.dev.
	 */
	return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
	return to_nd_region(to_dev(pmem)->parent);
}

static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;
	int rc = 0;

	sector = (offset - pmem->data_offset) / 512;

	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
	if (cleared < len)
		rc = -EIO;
	if (cleared > 0 && cleared / 512) {
		cleared /= 512;
		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
				(unsigned long long) sector, cleared,
				cleared > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared);
	}

	invalidate_pmem(pmem->virt_addr + offset, len);

	return rc;
}

static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	void *mem = kmap_atomic(page);

	memcpy_to_pmem(pmem_addr, mem + off, len);
	kunmap_atomic(mem);
}

static int read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	int rc;
	void *mem = kmap_atomic(page);

	rc = memcpy_from_pmem(mem + off, pmem_addr, len);
	kunmap_atomic(mem);
	if (rc)
		return -EIO;
	return 0;
}

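/*
 * Transfer one bio_vec worth of data between @page and the pmem mapping.
 * Reads that land on a known-bad range fail immediately with -EIO; writes
 * always hit the media first and, if the range was flagged bad, are
 * retried after attempting to clear the poison (see the comment in the
 * write path below).
 */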
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, bool is_write,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (!is_write) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			rc = read_pmem(page, off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		/*
		 * Note that we write the data both before and after
		 * clearing poison.  The write before clear poison
		 * handles situations where the latest written data is
		 * preserved and the clear poison operation simply marks
		 * the address range as valid without changing the data.
		 * In this case application software can assume that an
		 * interrupted write will either return the new good
		 * data or an error.
		 *
		 * However, if pmem_clear_poison() leaves the data in an
		 * indeterminate state we need to perform the write
		 * after clear poison.
		 */
		flush_dcache_page(page);
		write_pmem(pmem_addr, page, off, len);
		if (unlikely(bad_pmem)) {
			rc = pmem_clear_poison(pmem, pmem_off, len);
			write_pmem(pmem_addr, page, off, len);
		}
	}

	return rc;
}

/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif

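/*
 * pmem I/O is synchronous: every segment is copied in the submitter's
 * context and the bio is completed before this returns.  A REQ_FLUSH /
 * REQ_PREFLUSH flag triggers an nvdimm_flush() before the data transfer,
 * and REQ_FUA triggers another one afterwards.
 */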
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);

	if (bio->bi_opf & REQ_FLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, op_is_write(bio_op(bio)),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, is_write, 0);

	return rc;
}

/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
		void **kaddr, pfn_t *pfn, long size)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
		return -EIO;
	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/*
	 * If badblocks are present, limit known good range to the
	 * requested range.
	 */
	if (unlikely(pmem->bb.count))
		return size;
	return pmem->size - pmem->pfn_pad - offset;
}

static const struct block_device_operations pmem_fops = {
	.owner = THIS_MODULE,
	.rw_page = pmem_rw_page,
	.direct_access = pmem_direct_access,
	.revalidate_disk = nvdimm_revalidate_disk,
};

static void pmem_release_queue(void *q)
{
	blk_cleanup_queue(q);
}

static void pmem_release_disk(void *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}

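/*
 * Set up the block device for a pmem namespace: map the media (via
 * devm_memremap_pages() when the namespace is configured for struct page
 * mappings, plain devm_memremap() otherwise), create the request queue
 * and gendisk, and seed the badblocks list from the region's poison
 * records.  All teardown is devm-managed and runs in reverse order of
 * registration.
 */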
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns)
{
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct vmem_altmap __altmap, *altmap = NULL;
	struct resource *res = &nsio->res;
	struct nd_pfn *nd_pfn = NULL;
	int nid = dev_to_node(dev);
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	struct resource pfn_res;
	struct request_queue *q;
	struct gendisk *disk;
	void *addr;

	/* while nsio_rw_bytes is active, parse a pfn info block if present */
	if (is_nd_pfn(dev)) {
		nd_pfn = to_nd_pfn(dev);
		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
		if (IS_ERR(altmap))
			return PTR_ERR(altmap);
	}

	/* we're attaching a block device, disable raw namespace access */
	devm_nsio_disable(dev, nsio);

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;

	dev_set_drvdata(dev, pmem);
	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (nvdimm_has_flush(nd_region) < 0)
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(&ndns->dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return -ENOMEM;

	pmem->pfn_flags = PFN_DEV;
	if (is_nd_pfn(dev)) {
		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				altmap);
		pfn_sb = nd_pfn->pfn_sb;
		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
		pmem->pfn_flags |= PFN_MAP;
		res = &pfn_res; /* for badblocks populate */
		res->start += pmem->data_offset;
	} else if (pmem_should_map_pages(dev)) {
		addr = devm_memremap_pages(dev, &nsio->res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		addr = devm_memremap(dev, pmem->phys_addr,
				pmem->size, ARCH_MEMREMAP_PMEM);

	/*
	 * At release time the queue must be dead before
	 * devm_memremap_pages is unwound
	 */
	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
		return -ENOMEM;

	if (IS_ERR(addr))
		return PTR_ERR(addr);
	pmem->virt_addr = addr;

	blk_queue_write_cache(q, true, true);
	blk_queue_make_request(q, pmem_make_request);
	blk_queue_physical_block_size(q, PAGE_SIZE);
	blk_queue_max_hw_sectors(q, UINT_MAX);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
	q->queuedata = pmem;

	disk = alloc_disk_node(0, nid);
	if (!disk)
		return -ENOMEM;

	disk->fops = &pmem_fops;
	disk->queue = q;
	disk->flags = GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
	disk->bb = &pmem->bb;
	device_add_disk(dev, disk);

	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
		return -ENOMEM;

	revalidate_disk(disk);

	return 0;
}

static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (is_nd_pfn(dev))
		return pmem_attach_disk(dev, ndns);

	/* if we find a valid info-block we'll come back as that personality */
	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
			|| nd_dax_probe(dev, ndns) == 0)
		return -ENXIO;

	/* ...otherwise we're just a raw pmem device */
	return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	nvdimm_flush(to_nd_region(dev->parent));

	return 0;
}

static void nd_pmem_shutdown(struct device *dev)
{
	nvdimm_flush(to_nd_region(dev->parent));
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);
	struct nd_region *nd_region = to_region(pmem);
	resource_size_t offset = 0, end_trunc = 0;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
	} else if (is_nd_pfn(dev)) {
		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

		ndns = nd_pfn->ndns;
		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	} else
		ndns = to_ndns(dev);

	nsio = to_nd_namespace_io(&ndns->dev);
	res.start = nsio->res.start + offset;
	res.end = nsio->res.end - end_trunc;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.shutdown = nd_pmem_shutdown,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");