xref: /openbmc/linux/drivers/dax/super.c (revision 60696eb2)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include "dax-private.h"

/**
 * struct dax_device - anchor object for dax services
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: operations for this device
 */
struct dax_device {
	struct inode inode;
	struct cdev cdev;
	void *private;
	unsigned long flags;
	const struct dax_operations *ops;
};
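
/*
 * Illustrative sketch (assumption, not code from this file): a DAX
 * provider driver typically keeps one of these anchors alive for the
 * lifetime of its media.  "my_driver_data" and "my_dax_ops" below are
 * hypothetical; only alloc_dax(), kill_dax() and put_dax() are real
 * interfaces defined further down in this file.
 *
 *	dax_dev = alloc_dax(my_driver_data, &my_dax_ops, DAXDEV_F_SYNC);
 *	if (IS_ERR(dax_dev))
 *		return PTR_ERR(dax_dev);
 *	...
 *	// teardown: stop new operations, then drop the final reference
 *	kill_dax(dax_dev);
 *	put_dax(dax_dev);
 */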

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

int dax_read_lock(void)
{
	return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
	srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);

#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
#include <linux/blkdev.h>

static DEFINE_XARRAY(dax_hosts);

int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
	return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(dax_add_host);

void dax_remove_host(struct gendisk *disk)
{
	xa_erase(&dax_hosts, (unsigned long)disk);
}
EXPORT_SYMBOL_GPL(dax_remove_host);

/**
 * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
 * @bdev: block device to find a dax_device for
 */
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
	struct dax_device *dax_dev;
	int id;

	if (!blk_queue_dax(bdev->bd_disk->queue))
		return NULL;

	if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
	    (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
		pr_info("%pg: error: unaligned partition for dax\n", bdev);
		return NULL;
	}

	id = dax_read_lock();
	dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
		dax_dev = NULL;
	dax_read_unlock(id);

	return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
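
/*
 * Illustrative sketch (assumption, not code from this file): a
 * filesystem enabling fsdax would perform this lookup at mount time;
 * "sb" stands in for the filesystem's super_block.
 *
 *	struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
 *
 *	if (!dax_dev)
 *		return -EINVAL;	// or fall back to non-DAX I/O
 *	...
 *	put_dax(dax_dev);	// on unmount, drop the reference igrab'd above
 */
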
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
	/* !alive + rcu grace period == no new operations / mappings */
	DAXDEV_ALIVE,
	/* gate whether dax_flush() calls the low level flush routine */
	DAXDEV_WRITE_CACHE,
	/* flag to check if device supports synchronous flush */
	DAXDEV_SYNC,
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device relative @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn)
{
	long avail;

	if (!dax_dev)
		return -EOPNOTSUPP;

	if (!dax_alive(dax_dev))
		return -ENXIO;

	if (nr_pages < 0)
		return -EINVAL;

	avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
			kaddr, pfn);
	if (!avail)
		return -ERANGE;
	return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
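
/*
 * Illustrative sketch (assumption, not code from this file): callers
 * hold dax_read_lock() across the translation so the device cannot be
 * killed underneath them.  "pgoff", "nr" and "buf" are hypothetical
 * caller state.
 *
 *	void *kaddr;
 *	pfn_t pfn;
 *	long nr_mapped;
 *	int id;
 *
 *	id = dax_read_lock();
 *	nr_mapped = dax_direct_access(dax_dev, pgoff, nr, &kaddr, &pfn);
 *	if (nr_mapped > 0)
 *		memcpy(buf, kaddr, nr_mapped * PAGE_SIZE);
 *	dax_read_unlock(id);
 */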

size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);

int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
			size_t nr_pages)
{
	if (!dax_alive(dax_dev))
		return -ENXIO;
	/*
	 * There are no callers that want to zero more than one page as of now.
	 * Once users are there, this check can be removed after the
	 * device mapper code has been updated to split ranges across targets.
	 */
	if (nr_pages != 1)
		return -EIO;

	return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
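
/*
 * Illustrative sketch (assumption, not code from this file): given the
 * single-page limit above, a caller zeroing part of the device converts
 * its byte offset to a page offset and always passes nr_pages == 1.
 * "pos" is a hypothetical page-aligned byte offset into the dax_device.
 *
 *	int rc = dax_zero_page_range(dax_dev, pos >> PAGE_SHIFT, 1);
 */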

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
	if (unlikely(!dax_write_cache_enabled(dax_dev)))
		return;

	arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
	if (wc)
		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
	else
		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
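
/*
 * Illustrative sketch (assumption, not code from this file): a provider
 * whose media sits behind a volatile CPU cache opts in to flushing, after
 * which dax_flush() forwards to arch_wb_cache_pmem(); when the write
 * cache is left disabled dax_flush() is a nop.  "has_volatile_cache" is
 * a hypothetical capability flag.
 *
 *	dax_write_cache(dax_dev, has_volatile_cache);
 *	...
 *	dax_flush(dax_dev, kaddr, len);
 */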

bool __dax_synchronous(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__dax_synchronous);

void __set_dax_synchronous(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__set_dax_synchronous);

bool dax_alive(struct dax_device *dax_dev)
{
	lockdep_assert_held(&dax_srcu);
	return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);

/*
 * Note that rcu is not protecting the liveness of dax_dev; rcu is
 * ensuring that any fault handlers or operations that might have seen
 * dax_alive() have completed.  Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;

	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
	synchronize_srcu(&dax_srcu);
}
EXPORT_SYMBOL_GPL(kill_dax);
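
/*
 * Illustrative sketch (assumption, not code from this file): the
 * required shutdown order for a provider.  Once kill_dax() returns,
 * every operation that entered under dax_read_lock() has completed and
 * new ones observe !dax_alive() and abort, so the final reference can
 * be dropped safely.
 *
 *	kill_dax(dax_dev);
 *	put_dax(dax_dev);
 *	// now release the driver-private data passed to alloc_dax()
 */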

void run_dax(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(run_dax);

static struct inode *dax_alloc_inode(struct super_block *sb)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
	if (!dax_dev)
		return NULL;

	inode = &dax_dev->inode;
	inode->i_rdev = 0;
	return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
	return container_of(inode, struct dax_device, inode);
}

static void dax_free_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);
	if (inode->i_rdev)
		ida_simple_remove(&dax_minor_ida, iminor(inode));
	kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);
	WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
			"kill_dax() must be called before final iput()\n");
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.free_inode = dax_free_inode,
	.drop_inode = generic_delete_inode,
};

static int dax_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &dax_sops;
	return 0;
}

static struct file_system_type dax_fs_type = {
	.name		= "dax",
	.init_fs_context = dax_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	inode->i_rdev = devt;
	return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, &devt);

	if (!inode)
		return NULL;

	dax_dev = to_dax_dev(inode);
	if (inode->i_state & I_NEW) {
		set_bit(DAXDEV_ALIVE, &dax_dev->flags);
		inode->i_cdev = &dax_dev->cdev;
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}

	return dax_dev;
}

struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
		unsigned long flags)
{
	struct dax_device *dax_dev;
	dev_t devt;
	int minor;

	if (WARN_ON_ONCE(ops && !ops->zero_page_range))
		return ERR_PTR(-EINVAL);

	minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
	if (minor < 0)
		return ERR_PTR(-ENOMEM);

	devt = MKDEV(MAJOR(dax_devt), minor);
	dax_dev = dax_dev_get(devt);
	if (!dax_dev)
		goto err_dev;

	dax_dev->ops = ops;
	dax_dev->private = private;
	if (flags & DAXDEV_F_SYNC)
		set_dax_synchronous(dax_dev);

	return dax_dev;

 err_dev:
	ida_simple_remove(&dax_minor_ida, minor);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);

void put_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;
	iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * inode_dax: convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev() which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
	struct cdev *cdev = inode->i_cdev;

	return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);
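
/*
 * Illustrative sketch (assumption, not code from this file): the
 * "device dax" character-device open path recovers the dax_device from
 * the inode handed in by the VFS.  "example_open" is a hypothetical
 * file_operations .open implementation.
 *
 *	static int example_open(struct inode *inode, struct file *filp)
 *	{
 *		struct dax_device *dax_dev = inode_dax(inode);
 *
 *		filp->private_data = dax_get_private(dax_dev);
 *		if (!filp->private_data)
 *			return -ENXIO;
 *		return 0;
 *	}
 */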

struct inode *dax_inode(struct dax_device *dax_dev)
{
	return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
	if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags))
		return NULL;
	return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
	struct dax_device *dax_dev = _dax_dev;
	struct inode *inode = &dax_dev->inode;

	memset(dax_dev, 0, sizeof(*dax_dev));
	inode_init_once(inode);
}

static int dax_fs_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	dax_mnt = kern_mount(&dax_fs_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

 err_mount:
	kmem_cache_destroy(dax_cache);

	return rc;
}

static void dax_fs_exit(void)
{
	kern_unmount(dax_mnt);
	kmem_cache_destroy(dax_cache);
}

static int __init dax_core_init(void)
{
	int rc;

	rc = dax_fs_init();
	if (rc)
		return rc;

	rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
	if (rc)
		goto err_chrdev;

	rc = dax_bus_init();
	if (rc)
		goto err_bus;
	return 0;

err_bus:
	unregister_chrdev_region(dax_devt, MINORMASK+1);
err_chrdev:
	dax_fs_exit();
	return rc;
}

static void __exit dax_core_exit(void)
{
	dax_bus_exit();
	unregister_chrdev_region(dax_devt, MINORMASK+1);
	ida_destroy(&dax_minor_ida);
	dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_core_init);
module_exit(dax_core_exit);