xref: /openbmc/linux/drivers/nvdimm/pmem.c (revision 1c2dd16a)
/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
	/*
	 * nvdimm bus services need a 'dev' parameter, and we record the device
	 * at init in bb.dev.
	 */
	return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
	return to_nd_region(to_dev(pmem)->parent);
}

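/*
 * Ask the nvdimm bus to clear poison over @len bytes at device byte
 * @offset, drop any fully-cleared sectors from the badblocks list, and
 * invalidate stale cachelines for the range.  Returns -EIO if the bus
 * could not clear the entire range.
 */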
static int pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;
	int rc = 0;

	sector = (offset - pmem->data_offset) / 512;

	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);
	if (cleared < len)
		rc = -EIO;
	if (cleared > 0 && cleared / 512) {
		cleared /= 512;
		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n", __func__,
				(unsigned long long) sector, cleared,
				cleared > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared);
	}

	invalidate_pmem(pmem->virt_addr + offset, len);

	return rc;
}

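/*
 * Copy helpers: map the page with kmap_atomic() and move the data with the
 * arch pmem primitives.  memcpy_from_pmem() can fail if the source range
 * contains poison (uncorrectable media errors), which read_pmem() reports
 * as -EIO.
 */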
static void write_pmem(void *pmem_addr, struct page *page,
		unsigned int off, unsigned int len)
{
	void *mem = kmap_atomic(page);

	memcpy_to_pmem(pmem_addr, mem + off, len);
	kunmap_atomic(mem);
}

static int read_pmem(struct page *page, unsigned int off,
		void *pmem_addr, unsigned int len)
{
	int rc;
	void *mem = kmap_atomic(page);

	rc = memcpy_from_pmem(mem + off, pmem_addr, len);
	kunmap_atomic(mem);
	if (rc)
		return -EIO;
	return 0;
}

static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, bool is_write,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (!is_write) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			rc = read_pmem(page, off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		/*
		 * Note that we write the data both before and after
		 * clearing poison.  The write before clear poison
		 * handles situations where the latest written data is
		 * preserved and the clear poison operation simply marks
		 * the address range as valid without changing the data.
		 * In this case application software can assume that an
		 * interrupted write will either return the new good
		 * data or an error.
		 *
		 * However, if pmem_clear_poison() leaves the data in an
		 * indeterminate state we need to perform the write
		 * after clear poison.
		 */
		flush_dcache_page(page);
		write_pmem(pmem_addr, page, off, len);
		if (unlikely(bad_pmem)) {
			rc = pmem_clear_poison(pmem, pmem_off, len);
			write_pmem(pmem_addr, page, off, len);
		}
	}

	return rc;
}

/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif

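/*
 * Bio-based I/O path: honor REQ_FLUSH/REQ_PREFLUSH by flushing the region's
 * write-pending queues before copying the segments, and REQ_FUA by flushing
 * again after the data has been written, so a completed bio is durable on
 * media.
 */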
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);

	if (bio->bi_opf & REQ_FLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, op_is_write(bio_op(bio)),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, is_write, 0);

	return rc;
}

/* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
__weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
		      void **kaddr, pfn_t *pfn, long size)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	resource_size_t offset = sector * 512 + pmem->data_offset;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
		return -EIO;
	*kaddr = pmem->virt_addr + offset;
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);

	/*
	 * If badblocks are present, limit known good range to the
	 * requested range.
	 */
	if (unlikely(pmem->bb.count))
		return size;
	return pmem->size - pmem->pfn_pad - offset;
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
	.revalidate_disk =	nvdimm_revalidate_disk,
};

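/*
 * devm release actions, registered in pmem_attach_disk() and run in reverse
 * order on driver unbind: the gendisk is torn down first, then the queue is
 * frozen before devm_memremap_pages() is unwound, and blk_cleanup_queue()
 * runs last.
 */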
static void pmem_release_queue(void *q)
{
	blk_cleanup_queue(q);
}

static void pmem_freeze_queue(void *q)
{
	blk_freeze_queue_start(q);
}

static void pmem_release_disk(void *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}

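/*
 * Set up the block device for a pmem namespace: reserve the physical range,
 * allocate a bio-based request queue, map the namespace (see the comment
 * above the mapping block below), size the gendisk from the mapped range
 * minus any pfn metadata, seed the badblocks list, and register the disk.
 */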
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns)
{
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct vmem_altmap __altmap, *altmap = NULL;
	struct resource *res = &nsio->res;
	struct nd_pfn *nd_pfn = NULL;
	int nid = dev_to_node(dev);
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	struct resource pfn_res;
	struct request_queue *q;
	struct gendisk *disk;
	void *addr;

	/* while nsio_rw_bytes is active, parse a pfn info block if present */
	if (is_nd_pfn(dev)) {
		nd_pfn = to_nd_pfn(dev);
		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
		if (IS_ERR(altmap))
			return PTR_ERR(altmap);
	}

	/* we're attaching a block device, disable raw namespace access */
	devm_nsio_disable(dev, nsio);

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;

	dev_set_drvdata(dev, pmem);
	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (nvdimm_has_flush(nd_region) < 0)
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(&ndns->dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return -ENOMEM;

	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
		return -ENOMEM;

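	/*
	 * Map the namespace one of three ways: pfn mode uses the info block
	 * (and optional altmap) so struct page metadata can live in pmem
	 * itself, page-mapped mode uses devm_memremap_pages() over the raw
	 * resource, and otherwise the range is simply devm_memremap()'d
	 * without struct page backing.
	 */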
	pmem->pfn_flags = PFN_DEV;
	if (is_nd_pfn(dev)) {
		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				altmap);
		pfn_sb = nd_pfn->pfn_sb;
		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
		pmem->pfn_flags |= PFN_MAP;
		res = &pfn_res; /* for badblocks populate */
		res->start += pmem->data_offset;
	} else if (pmem_should_map_pages(dev)) {
		addr = devm_memremap_pages(dev, &nsio->res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		addr = devm_memremap(dev, pmem->phys_addr,
				pmem->size, ARCH_MEMREMAP_PMEM);

	/*
	 * At release time the queue must be frozen before
	 * devm_memremap_pages is unwound
	 */
	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
		return -ENOMEM;

	if (IS_ERR(addr))
		return PTR_ERR(addr);
	pmem->virt_addr = addr;

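	/*
	 * Advertise a writeback cache with FUA support so that flush and FUA
	 * requests reach pmem_make_request(), and mark the queue
	 * non-rotational and DAX-capable.
	 */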
	blk_queue_write_cache(q, true, true);
	blk_queue_make_request(q, pmem_make_request);
	blk_queue_physical_block_size(q, PAGE_SIZE);
	blk_queue_max_hw_sectors(q, UINT_MAX);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
	q->queuedata = pmem;

	disk = alloc_disk_node(0, nid);
	if (!disk)
		return -ENOMEM;

	disk->fops		= &pmem_fops;
	disk->queue		= q;
	disk->flags		= GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
	disk->bb = &pmem->bb;
	device_add_disk(dev, disk);

	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
		return -ENOMEM;

	revalidate_disk(disk);

	return 0;
}

static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (is_nd_pfn(dev))
		return pmem_attach_disk(dev, ndns);

	/* if we find a valid info-block we'll come back as that personality */
	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
			|| nd_dax_probe(dev, ndns) == 0)
		return -ENXIO;

	/* ...otherwise we're just a raw pmem device */
	return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	nvdimm_flush(to_nd_region(dev->parent));

	return 0;
}

static void nd_pmem_shutdown(struct device *dev)
{
	nvdimm_flush(to_nd_region(dev->parent));
}

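/*
 * NVDIMM_REVALIDATE_POISON: the bus found new (or cleared) media errors, so
 * rebuild the badblocks list.  For btt/pfn devices the namespace is reached
 * through the claiming device, and for pfn devices the scanned range skips
 * the info-block metadata (data offset plus start padding) and any end
 * truncation.
 */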
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);
	struct nd_region *nd_region = to_region(pmem);
	resource_size_t offset = 0, end_trunc = 0;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
	} else if (is_nd_pfn(dev)) {
		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

		ndns = nd_pfn->ndns;
		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	} else
		ndns = to_ndns(dev);

	nsio = to_nd_namespace_io(&ndns->dev);
	res.start = nsio->res.start + offset;
	res.end = nsio->res.end - end_trunc;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.shutdown = nd_pmem_shutdown,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");