xref: /openbmc/linux/drivers/nvdimm/pmem.c (revision f2a89d3b)
/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pmem.h"
#include "pfn.h"
#include "nd.h"

static struct device *to_dev(struct pmem_device *pmem)
{
	/*
	 * nvdimm bus services need a 'dev' parameter, and we record the device
	 * at init in bb.dev.
	 */
	return pmem->bb.dev;
}

static struct nd_region *to_region(struct pmem_device *pmem)
{
	return to_nd_region(to_dev(pmem)->parent);
}

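/*
 * Ask the nvdimm bus to clear poison in the given range; on success,
 * drop the corresponding sectors from the badblocks list, then
 * invalidate any stale cache lines for the cleared media.
 */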
static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
		unsigned int len)
{
	struct device *dev = to_dev(pmem);
	sector_t sector;
	long cleared;

	sector = (offset - pmem->data_offset) / 512;
	cleared = nvdimm_clear_poison(dev, pmem->phys_addr + offset, len);

	if (cleared > 0 && cleared / 512) {
		dev_dbg(dev, "%s: %#llx clear %ld sector%s\n",
				__func__, (unsigned long long) sector,
				cleared / 512, cleared / 512 > 1 ? "s" : "");
		badblocks_clear(&pmem->bb, sector, cleared / 512);
	}
	invalidate_pmem(pmem->virt_addr + offset, len);
}

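/*
 * Copy one bio_vec worth of data between the page and pmem.  Reads from
 * known-bad pmem fail with -EIO; writes always reach the media and, if
 * the range was poisoned, are retried after asking the bus to clear the
 * poison (see the comment in the write path below).
 */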
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, bool is_write,
			sector_t sector)
{
	int rc = 0;
	bool bad_pmem = false;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void *pmem_addr = pmem->virt_addr + pmem_off;

	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
		bad_pmem = true;

	if (!is_write) {
		if (unlikely(bad_pmem))
			rc = -EIO;
		else {
			rc = memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		/*
		 * Note that we write the data both before and after
		 * clearing poison.  The write before clear poison
		 * handles situations where the latest written data is
		 * preserved and the clear poison operation simply marks
		 * the address range as valid without changing the data.
		 * In this case application software can assume that an
		 * interrupted write will either return the new good
		 * data or an error.
		 *
		 * However, if pmem_clear_poison() leaves the data in an
		 * indeterminate state we need to perform the write
		 * after clear poison.
		 */
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
		if (unlikely(bad_pmem)) {
			pmem_clear_poison(pmem, pmem_off, len);
			memcpy_to_pmem(pmem_addr, mem + off, len);
		}
	}

	kunmap_atomic(mem);
	return rc;
}

/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif

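/*
 * bio submission path: honor REQ_FLUSH/REQ_PREFLUSH before touching the
 * data and REQ_FUA afterwards by flushing the region's write-pending
 * queues, copy each segment synchronously, and report the first error
 * via bio->bi_error.
 */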
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
	int rc = 0;
	bool do_acct;
	unsigned long start;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct pmem_device *pmem = q->queuedata;
	struct nd_region *nd_region = to_region(pmem);

	if (bio->bi_opf & REQ_FLUSH)
		nvdimm_flush(nd_region);

	do_acct = nd_iostat_start(bio, &start);
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, op_is_write(bio_op(bio)),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
	if (do_acct)
		nd_iostat_end(bio, start);

	if (bio->bi_opf & REQ_FUA)
		nvdimm_flush(nd_region);

	bio_endio(bio);
	return BLK_QC_T_NONE;
}

static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, bool is_write)
{
	struct pmem_device *pmem = bdev->bd_queue->queuedata;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, is_write, sector);

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so we can only invoke page_endio() in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc == 0)
		page_endio(page, is_write, 0);

	return rc;
}

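/*
 * DAX entry point: translate a sector to a kernel virtual address and
 * pfn_t within the mapped namespace, refusing ranges that intersect
 * known badblocks and reporting how far the valid mapping extends.
 */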
174 /* see "strong" declaration in tools/testing/nvdimm/pmem-dax.c */
175 __weak long pmem_direct_access(struct block_device *bdev, sector_t sector,
176 		      void **kaddr, pfn_t *pfn, long size)
177 {
178 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
179 	resource_size_t offset = sector * 512 + pmem->data_offset;
180 
181 	if (unlikely(is_bad_pmem(&pmem->bb, sector, size)))
182 		return -EIO;
183 	*kaddr = pmem->virt_addr + offset;
184 	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
185 
186 	/*
187 	 * If badblocks are present, limit known good range to the
188 	 * requested range.
189 	 */
190 	if (unlikely(pmem->bb.count))
191 		return size;
192 	return pmem->size - pmem->pfn_pad - offset;
193 }
194 
195 static const struct block_device_operations pmem_fops = {
196 	.owner =		THIS_MODULE,
197 	.rw_page =		pmem_rw_page,
198 	.direct_access =	pmem_direct_access,
199 	.revalidate_disk =	nvdimm_revalidate_disk,
200 };
201 
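/* devm release actions used to tear down the request queue and gendisk */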
static void pmem_release_queue(void *q)
{
	blk_cleanup_queue(q);
}

static void pmem_release_disk(void *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}

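/*
 * Map the namespace and register it as a gendisk: namespaces with a
 * pfn info block are mapped via devm_memremap_pages() with an altmap,
 * page-mapped namespaces via devm_memremap_pages() alone, and raw
 * namespaces via a plain devm_memremap().
 */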
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns)
{
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_region *nd_region = to_nd_region(dev->parent);
	struct vmem_altmap __altmap, *altmap = NULL;
	struct resource *res = &nsio->res;
	struct nd_pfn *nd_pfn = NULL;
	int nid = dev_to_node(dev);
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	struct resource pfn_res;
	struct request_queue *q;
	struct gendisk *disk;
	void *addr;

	/* while nsio_rw_bytes is active, parse a pfn info block if present */
	if (is_nd_pfn(dev)) {
		nd_pfn = to_nd_pfn(dev);
		altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap);
		if (IS_ERR(altmap))
			return PTR_ERR(altmap);
	}

	/* we're attaching a block device, disable raw namespace access */
	devm_nsio_disable(dev, nsio);

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;

	dev_set_drvdata(dev, pmem);
	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
	if (nvdimm_has_flush(nd_region) < 0)
		dev_warn(dev, "unable to guarantee persistence of writes\n");

	if (!devm_request_mem_region(dev, res->start, resource_size(res),
				dev_name(dev))) {
		dev_warn(dev, "could not reserve region %pR\n", res);
		return -EBUSY;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return -ENOMEM;

	pmem->pfn_flags = PFN_DEV;
	if (is_nd_pfn(dev)) {
		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
				altmap);
		pfn_sb = nd_pfn->pfn_sb;
		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
		pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res);
		pmem->pfn_flags |= PFN_MAP;
		res = &pfn_res; /* for badblocks populate */
		res->start += pmem->data_offset;
	} else if (pmem_should_map_pages(dev)) {
		addr = devm_memremap_pages(dev, &nsio->res,
				&q->q_usage_counter, NULL);
		pmem->pfn_flags |= PFN_MAP;
	} else
		addr = devm_memremap(dev, pmem->phys_addr,
				pmem->size, ARCH_MEMREMAP_PMEM);

	/*
	 * At release time the queue must be dead before
	 * devm_memremap_pages is unwound
	 */
	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
		return -ENOMEM;

	if (IS_ERR(addr))
		return PTR_ERR(addr);
	pmem->virt_addr = addr;

	blk_queue_write_cache(q, true, true);
	blk_queue_make_request(q, pmem_make_request);
	blk_queue_physical_block_size(q, PAGE_SIZE);
	blk_queue_max_hw_sectors(q, UINT_MAX);
	blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, q);
	q->queuedata = pmem;

	disk = alloc_disk_node(0, nid);
	if (!disk)
		return -ENOMEM;

	disk->fops		= &pmem_fops;
	disk->queue		= q;
	disk->flags		= GENHD_FL_EXT_DEVT;
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
	set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset)
			/ 512);
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, res);
	disk->bb = &pmem->bb;
	device_add_disk(dev, disk);

	if (devm_add_action_or_reset(dev, pmem_release_disk, disk))
		return -ENOMEM;

	revalidate_disk(disk);

	return 0;
}

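/*
 * Probe: attach via the btt or pfn personality when one is already
 * established, otherwise look for a valid info block and fall back to
 * registering a raw pmem disk.
 */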
static int nd_pmem_probe(struct device *dev)
{
	struct nd_namespace_common *ndns;

	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);

	if (devm_nsio_enable(dev, to_nd_namespace_io(&ndns->dev)))
		return -ENXIO;

	if (is_nd_btt(dev))
		return nvdimm_namespace_attach_btt(ndns);

	if (is_nd_pfn(dev))
		return pmem_attach_disk(dev, ndns);

	/* if we find a valid info-block we'll come back as that personality */
	if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
			|| nd_dax_probe(dev, ndns) == 0)
		return -ENXIO;

	/* ...otherwise we're just a raw pmem device */
	return pmem_attach_disk(dev, ndns);
}

static int nd_pmem_remove(struct device *dev)
{
	if (is_nd_btt(dev))
		nvdimm_namespace_detach_btt(to_nd_btt(dev));
	nvdimm_flush(to_nd_region(dev->parent));

	return 0;
}

static void nd_pmem_shutdown(struct device *dev)
{
	nvdimm_flush(to_nd_region(dev->parent));
}

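/*
 * Bus notification: on NVDIMM_REVALIDATE_POISON re-read the region's
 * poison list and repopulate badblocks for the (possibly offset and
 * truncated) range that this namespace personality exposes.
 */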
static void nd_pmem_notify(struct device *dev, enum nvdimm_event event)
{
	struct pmem_device *pmem = dev_get_drvdata(dev);
	struct nd_region *nd_region = to_region(pmem);
	resource_size_t offset = 0, end_trunc = 0;
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
	struct resource res;

	if (event != NVDIMM_REVALIDATE_POISON)
		return;

	if (is_nd_btt(dev)) {
		struct nd_btt *nd_btt = to_nd_btt(dev);

		ndns = nd_btt->ndns;
	} else if (is_nd_pfn(dev)) {
		struct nd_pfn *nd_pfn = to_nd_pfn(dev);
		struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb;

		ndns = nd_pfn->ndns;
		offset = pmem->data_offset + __le32_to_cpu(pfn_sb->start_pad);
		end_trunc = __le32_to_cpu(pfn_sb->end_trunc);
	} else
		ndns = to_ndns(dev);

	nsio = to_nd_namespace_io(&ndns->dev);
	res.start = nsio->res.start + offset;
	res.end = nsio->res.end - end_trunc;
	nvdimm_badblocks_populate(nd_region, &pmem->bb, &res);
}

MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.notify = nd_pmem_notify,
	.shutdown = nd_pmem_shutdown,
	.drv = {
		.name = "nd_pmem",
	},
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
};

static int __init pmem_init(void)
{
	return nd_driver_register(&nd_pmem_driver);
}
module_init(pmem_init);

static void pmem_exit(void)
{
	driver_unregister(&nd_pmem_driver.drv);
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");