xref: /openbmc/linux/drivers/dax/super.c (revision fd1d00ec)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include "dax-private.h"

/**
 * struct dax_device - anchor object for dax services
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: dax driver operations backing this device
 */
struct dax_device {
	struct inode inode;
	struct cdev cdev;
	void *private;
	unsigned long flags;
	const struct dax_operations *ops;
};

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

int dax_read_lock(void)
{
	return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
	srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
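
/*
 * Illustrative usage sketch, not part of the original file: callers are
 * expected to bracket dax_device operations with dax_read_lock() /
 * dax_read_unlock() so that kill_dax() can wait out in-flight users via
 * synchronize_srcu().  dax_dev, pgoff, kaddr and pfn below stand in for
 * caller-provided state.
 *
 *	int id = dax_read_lock();
 *	long nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *
 *	dax_read_unlock(id);
 */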

#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
#include <linux/blkdev.h>

static DEFINE_XARRAY(dax_hosts);

int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
	return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(dax_add_host);

void dax_remove_host(struct gendisk *disk)
{
	xa_erase(&dax_hosts, (unsigned long)disk);
}
EXPORT_SYMBOL_GPL(dax_remove_host);

/**
 * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
 * @bdev: block device to find a dax_device for
 * @start_off: returns the byte offset into the dax_device at which @bdev starts
 */
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off)
{
	struct dax_device *dax_dev;
	u64 part_size;
	int id;

	if (!blk_queue_dax(bdev->bd_disk->queue))
		return NULL;

	*start_off = get_start_sect(bdev) * SECTOR_SIZE;
	part_size = bdev_nr_sectors(bdev) * SECTOR_SIZE;
	if (*start_off % PAGE_SIZE || part_size % PAGE_SIZE) {
		pr_info("%pg: error: unaligned partition for dax\n", bdev);
		return NULL;
	}

	id = dax_read_lock();
	dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
		dax_dev = NULL;
	dax_read_unlock(id);

	return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
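
/*
 * Illustrative usage sketch, not part of the original file: a filesystem
 * probing for DAX support at mount time would do roughly the following,
 * with put_dax() later dropping the inode reference taken here.  A real
 * filesystem might fall back to non-DAX I/O instead of failing the mount;
 * dax_part_off is an assumed local name.
 *
 *	u64 dax_part_off;
 *	struct dax_device *dax_dev;
 *
 *	dax_dev = fs_dax_get_by_bdev(sb->s_bdev, &dax_part_off);
 *	if (!dax_dev)
 *		return -EOPNOTSUPP;
 */
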
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
	/* !alive + rcu grace period == no new operations / mappings */
	DAXDEV_ALIVE,
	/* gate whether dax_flush() calls the low level flush routine */
	DAXDEV_WRITE_CACHE,
	/* flag to check if device supports synchronous flush */
	DAXDEV_SYNC,
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device-relative @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn)
{
	long avail;

	if (!dax_dev)
		return -EOPNOTSUPP;

	if (!dax_alive(dax_dev))
		return -ENXIO;

	if (nr_pages < 0)
		return -EINVAL;

	avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
			kaddr, pfn);
	if (!avail)
		return -ERANGE;
	return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
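
/*
 * Illustrative usage sketch, not part of the original file: mapping one
 * page at a device-relative offset while holding dax_read_lock().  The
 * caller is assumed to have already translated its file offset into a
 * device @pgoff (e.g. using the partition start offset reported by
 * fs_dax_get_by_bdev()); src and len stand in for caller state.
 *
 *	void *kaddr;
 *	pfn_t pfn;
 *	long nr;
 *
 *	nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
 *	if (nr < 0)
 *		return nr;
 *	memcpy_flushcache(kaddr, src, len);	(len <= PAGE_SIZE)
 */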

size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
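
/*
 * Illustrative sketch, not part of the original file: the generic
 * filesystem-dax I/O path (fs/dax.c) pairs these helpers with
 * dax_direct_access(), roughly as below.  Sizing and error handling are
 * simplified and the variable names are assumptions.
 *
 *	map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 *			&kaddr, NULL);
 *	map_len = PFN_PHYS(map_len);		(pages -> bytes)
 *	if (iov_iter_rw(iter) == WRITE)
 *		xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter);
 *	else
 *		xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter);
 */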

int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
			size_t nr_pages)
{
	if (!dax_alive(dax_dev))
		return -ENXIO;
	/*
	 * There are no callers that want to zero more than one page as of now.
	 * Once such callers exist, this check can be removed after the
	 * device mapper code has been updated to split ranges across targets.
	 */
	if (nr_pages != 1)
		return -EIO;

	return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);
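
/*
 * Illustrative sketch, not part of the original file: a filesystem zeroing
 * a page-aligned, page-sized range of pmem goes through this helper one
 * page at a time (see the check above); sub-page zeroing in fs/dax.c is
 * instead done via dax_direct_access() plus memset() and dax_flush().
 *
 *	rc = dax_zero_page_range(dax_dev, pgoff, 1);
 *	if (rc)
 *		return rc;
 */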

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
	if (unlikely(!dax_write_cache_enabled(dax_dev)))
		return;

	arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
	if (wc)
		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
	else
		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
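
/*
 * Illustrative sketch, not part of the original file: a pmem-style driver
 * advertises whether its media sits behind a volatile write-back cache so
 * that dax_flush() knows whether cache maintenance is required, e.g.:
 *
 *	dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
 */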

bool dax_synchronous(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_synchronous);

void set_dax_synchronous(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(set_dax_synchronous);

bool dax_alive(struct dax_device *dax_dev)
{
	lockdep_assert_held(&dax_srcu);
	return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);

/*
 * Note, rcu is not protecting the liveness of dax_dev; rcu is ensuring
 * that any fault handlers or operations that might have seen
 * dax_alive() have completed.  Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;

	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
	synchronize_srcu(&dax_srcu);
}
EXPORT_SYMBOL_GPL(kill_dax);
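
/*
 * Illustrative sketch, not part of the original file: the expected
 * teardown order for a driver that created a dax_device with alloc_dax()
 * is to first revoke it, which waits out in-flight operations under SRCU,
 * and only then drop the final reference:
 *
 *	kill_dax(dax_dev);
 *	put_dax(dax_dev);
 */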

void run_dax(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(run_dax);

static struct inode *dax_alloc_inode(struct super_block *sb)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
	if (!dax_dev)
		return NULL;

	inode = &dax_dev->inode;
	inode->i_rdev = 0;
	return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
	return container_of(inode, struct dax_device, inode);
}

static void dax_free_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);
	if (inode->i_rdev)
		ida_simple_remove(&dax_minor_ida, iminor(inode));
	kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);
	WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
			"kill_dax() must be called before final iput()\n");
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.free_inode = dax_free_inode,
	.drop_inode = generic_delete_inode,
};

static int dax_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &dax_sops;
	return 0;
}

static struct file_system_type dax_fs_type = {
	.name		= "dax",
	.init_fs_context = dax_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	inode->i_rdev = devt;
	return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, &devt);

	if (!inode)
		return NULL;

	dax_dev = to_dax_dev(inode);
	if (inode->i_state & I_NEW) {
		set_bit(DAXDEV_ALIVE, &dax_dev->flags);
		inode->i_cdev = &dax_dev->cdev;
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}

	return dax_dev;
}

struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
		unsigned long flags)
{
	struct dax_device *dax_dev;
	dev_t devt;
	int minor;

	if (WARN_ON_ONCE(ops && !ops->zero_page_range))
		return ERR_PTR(-EINVAL);

	minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
	if (minor < 0)
		return ERR_PTR(-ENOMEM);

	devt = MKDEV(MAJOR(dax_devt), minor);
	dax_dev = dax_dev_get(devt);
	if (!dax_dev)
		goto err_dev;

	dax_dev->ops = ops;
	dax_dev->private = private;
	if (flags & DAXDEV_F_SYNC)
		set_dax_synchronous(dax_dev);

	return dax_dev;

 err_dev:
	ida_simple_remove(&dax_minor_ida, minor);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);
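
/*
 * Illustrative sketch, not part of the original file: a driver providing
 * dax_operations (the pmem driver is the canonical user) creates its
 * dax_device roughly like this; pmem and pmem_dax_ops are assumed names,
 * and DAXDEV_F_SYNC is only passed when the media supports synchronous
 * flush.
 *
 *	dax_dev = alloc_dax(pmem, &pmem_dax_ops, DAXDEV_F_SYNC);
 *	if (IS_ERR(dax_dev))
 *		return PTR_ERR(dax_dev);
 */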

void put_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;
	iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * inode_dax() - convert a public inode into its dax_dev
 * @inode: an inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev(), which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
	struct cdev *cdev = inode->i_cdev;

	return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
	return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
	if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags))
		return NULL;
	return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);
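
/*
 * Illustrative sketch, not part of the original file: the "device dax"
 * character device recovers its dax_device state at open() time roughly
 * as below, redirecting the file mapping to the internal dax inode.
 * dax_ino and dev_dax stand in for local variables in such a driver.
 *
 *	struct dax_device *dax_dev = inode_dax(inode);
 *	struct inode *dax_ino = dax_inode(dax_dev);
 *	struct dev_dax *dev_dax = dax_get_private(dax_dev);
 *
 *	filp->f_mapping = dax_ino->i_mapping;
 */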

static void init_once(void *_dax_dev)
{
	struct dax_device *dax_dev = _dax_dev;
	struct inode *inode = &dax_dev->inode;

	memset(dax_dev, 0, sizeof(*dax_dev));
	inode_init_once(inode);
}

static int dax_fs_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	dax_mnt = kern_mount(&dax_fs_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

 err_mount:
	kmem_cache_destroy(dax_cache);

	return rc;
}

static void dax_fs_exit(void)
{
	kern_unmount(dax_mnt);
	kmem_cache_destroy(dax_cache);
}

static int __init dax_core_init(void)
{
	int rc;

	rc = dax_fs_init();
	if (rc)
		return rc;

	rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
	if (rc)
		goto err_chrdev;

	rc = dax_bus_init();
	if (rc)
		goto err_bus;
	return 0;

err_bus:
	unregister_chrdev_region(dax_devt, MINORMASK+1);
err_chrdev:
	dax_fs_exit();
	return rc;
}

static void __exit dax_core_exit(void)
{
	dax_bus_exit();
	unregister_chrdev_region(dax_devt, MINORMASK+1);
	ida_destroy(&dax_minor_ida);
	dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_core_init);
module_exit(dax_core_exit);