/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
	/*
	 * nvdimm bus services need a 'dev' parameter, and we record the device
	 * at init in bb.dev.
	 */
	return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
	return to_nd_region(to_dev(pmem)->parent);
}

static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;
	int rc = 0;

	sector = (offset - pmem->data_offset) / 512;

	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
	if (cleared < len)
		rc = -EIO;
	if (cleared > 0 && cleared / 512) {
		cleared /= 512;
		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
				(unsigned long long) sector, cleared,
				cleared > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared);
	}

	invalidate_pmem(pmem->virt_addr + offset, len);

	return rc;
}

static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	void *mem = kmap_atomic(page);

	memcpy_to_pmem(pmem_addr, mem + off, len);
	kunmap_atomic(mem);
}

static int read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	int rc;
	void *mem = kmap_atomic(page);

	rc = memcpy_from_pmem(mem + off, pmem_addr, len);
	kunmap_atomic(mem);
	return rc;
}
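/*
 * Transfer one bio_vec worth of data between @page and the pmem media at
 * @sector.  Reads from a known-bad range fail with -EIO; writes always go
 * to the media and, if the range was poisoned, are retried after an
 * attempt to clear the poison.
 */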
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, bool is_write,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (!is_write) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			rc = read_pmem(page, off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		/*
		 * Note that we write the data both before and after
		 * clearing poison.  The write before clear poison
		 * handles situations where the latest written data is
		 * preserved and the clear poison operation simply marks
		 * the address range as valid without changing the data.
		 * In this case application software can assume that an
		 * interrupted write will either return the new good
		 * data or an error.
		 *
		 * However, if pmem_clear_poison() leaves the data in an
		 * indeterminate state we need to perform the write
		 * after clear poison.
		 */
		flush_dcache_page(page);
		write_pmem(pmem_addr, page, off, len);
		if (unlikely(bad_pmem)) {
			rc = pmem_clear_poison(pmem, pmem_off, len);
			write_pmem(pmem_addr, page, off, len);
		}
	}

	return rc;
}

/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);

	if (bio->bi_opf & REQ_FLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, op_is_write(bio_op(bio)),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, is_write, 0);

	return rc;
}
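/*
 * DAX entry point: translate @sector to a kernel virtual address and pfn
 * for the backing pmem.  Returns -EIO if the requested range overlaps a
 * known bad block; otherwise returns the usable length from the offset,
 * limited to @size whenever any badblocks are recorded for the device.
 */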
/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
		      void **kaddr, pfn_t *pfn, long size)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
		return -EIO;
	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/*
	 * If badblocks are present, limit known good range to the
	 * requested range.
	 */
	if (unlikely(pmem->bb.count))
		return size;
	return pmem->size - pmem->pfn_pad - offset;
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
	.revalidate_disk =	nvdimm_revalidate_disk,
};

static void pmem_release_queue(void *q)
{
	blk_cleanup_queue(q);
}

static void pmem_release_disk(void *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}
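/*
 * Set up the block device for a pmem namespace: map the namespace (with
 * struct pages via devm_memremap_pages() when a pfn info block is in use
 * or page mapping is requested, plain devm_memremap() otherwise), create
 * the request queue and gendisk, and seed the badblocks list from the
 * region's poison records.  All resources are devm-managed, so teardown
 * is driven by device unbind.
 */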
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns)
{
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct vmem_altmap __altmap, *altmap = NULL;
	struct resource *res = &nsio->res;
	struct nd_pfn *nd_pfn = NULL;
	int nid = dev_to_node(dev);
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	struct resource pfn_res;
	struct request_queue *q;
	struct gendisk *disk;
	void *addr;

	/* while nsio_rw_bytes is active, parse a pfn info block if present */
	if (is_nd_pfn(dev)) {
		nd_pfn = to_nd_pfn(dev);
		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
		if (IS_ERR(altmap))
			return PTR_ERR(altmap);
	}

	/* we're attaching a block device, disable raw namespace access */
	devm_nsio_disable(dev, nsio);

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;

	dev_set_drvdata(dev, pmem);
	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (nvdimm_has_flush(nd_region) < 0)
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(&ndns->dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return -ENOMEM;

	pmem->pfn_flags = PFN_DEV;
	if (is_nd_pfn(dev)) {
		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				altmap);
		pfn_sb = nd_pfn->pfn_sb;
		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
		pmem->pfn_flags |= PFN_MAP;
		res = &pfn_res; /* for badblocks populate */
		res->start += pmem->data_offset;
	} else if (pmem_should_map_pages(dev)) {
		addr = devm_memremap_pages(dev, &nsio->res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		addr = devm_memremap(dev, pmem->phys_addr,
				pmem->size, ARCH_MEMREMAP_PMEM);

	/*
	 * At release time the queue must be dead before
	 * devm_memremap_pages is unwound
	 */
	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
		return -ENOMEM;

	if (IS_ERR(addr))
		return PTR_ERR(addr);
	pmem->virt_addr = addr;

	blk_queue_write_cache(q, true, true);
	blk_queue_make_request(q, pmem_make_request);
	blk_queue_physical_block_size(q, PAGE_SIZE);
	blk_queue_max_hw_sectors(q, UINT_MAX);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
	q->queuedata = pmem;

	disk = alloc_disk_node(0, nid);
	if (!disk)
		return -ENOMEM;

	disk->fops = &pmem_fops;
	disk->queue = q;
	disk->flags = GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
	disk->bb = &pmem->bb;
	device_add_disk(dev, disk);

	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
		return -ENOMEM;

	revalidate_disk(disk);

	return 0;
}

static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (is_nd_pfn(dev))
		return pmem_attach_disk(dev, ndns);

	/* if we find a valid info-block we'll come back as that personality */
	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
			|| nd_dax_probe(dev, ndns) == 0)
		return -ENXIO;

	/* ...otherwise we're just a raw pmem device */
	return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	nvdimm_flush(to_nd_region(dev->parent));

	return 0;
}

static void nd_pmem_shutdown(struct device *dev)
{
	nvdimm_flush(to_nd_region(dev->parent));
}

static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);
	struct nd_region *nd_region = to_region(pmem);
	resource_size_t offset = 0, end_trunc = 0;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
	} else if (is_nd_pfn(dev)) {
		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

		ndns = nd_pfn->ndns;
		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	} else
		ndns = to_ndns(dev);

	nsio = to_nd_namespace_io(&ndns->dev);
	res.start = nsio->res.start + offset;
	res.end = nsio->res.end - end_trunc;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.shutdown = nd_pmem_shutdown,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");