// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include "dax-private.h"

/**
 * struct dax_device - anchor object for dax services
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: operations vector provided by the backing driver
 */
struct dax_device {
	struct inode inode;
	struct cdev cdev;
	void *private;
	unsigned long flags;
	const struct dax_operations *ops;
};

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

int dax_read_lock(void)
{
	return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
	srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);

#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
#include <linux/blkdev.h>

static DEFINE_XARRAY(dax_hosts);

int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
	return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(dax_add_host);

void dax_remove_host(struct gendisk *disk)
{
	xa_erase(&dax_hosts, (unsigned long)disk);
}
EXPORT_SYMBOL_GPL(dax_remove_host);

/**
 * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
 * @bdev: block device to find a dax_device for
 */
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
	struct dax_device *dax_dev;
	int id;

	if (!blk_queue_dax(bdev->bd_disk->queue))
		return NULL;

	if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
	    (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
		pr_info("%pg: error: unaligned partition for dax\n", bdev);
		return NULL;
	}

	id = dax_read_lock();
	dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
	if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
		dax_dev = NULL;
	dax_read_unlock(id);

	return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
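
/*
 * Example (sketch): how a hypothetical filesystem could resolve the
 * dax_device backing its block device at mount time. "example_sbi" and
 * "example_sb_get_dax" are illustrative names, not an in-tree caller;
 * a real filesystem drops the reference with put_dax() at unmount.
 */
struct example_sbi {
	struct dax_device *dax_dev;	/* NULL when DAX is unavailable */
};

static inline int example_sb_get_dax(struct super_block *sb,
		struct example_sbi *sbi)
{
	/* takes an inode reference on success, balanced by put_dax() */
	sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
	if (!sbi->dax_dev)
		return -EOPNOTSUPP;
	return 0;
}
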
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
	/* !alive + rcu grace period == no new operations / mappings */
	DAXDEV_ALIVE,
	/* gate whether dax_flush() calls the low level flush routine */
	DAXDEV_WRITE_CACHE,
	/* flag to check if device supports synchronous flush */
	DAXDEV_SYNC,
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device relative @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
		void **kaddr, pfn_t *pfn)
{
	long avail;

	if (!dax_dev)
		return -EOPNOTSUPP;

	if (!dax_alive(dax_dev))
		return -ENXIO;

	if (nr_pages < 0)
		return -EINVAL;

	avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
			kaddr, pfn);
	if (!avail)
		return -ERANGE;
	return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
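
/*
 * Example (sketch): a minimal caller of dax_direct_access() following the
 * dax_read_lock() protocol, translating a single page and touching it only
 * while the srcu read lock is held. example_read_first_byte() is an
 * illustrative helper, not an existing caller.
 */
static inline int example_read_first_byte(struct dax_device *dax_dev,
		pgoff_t pgoff, u8 *value)
{
	void *kaddr;
	pfn_t pfn;
	long rc;
	int id;

	id = dax_read_lock();
	/* request exactly one page; a positive return is the mapped count */
	rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
	if (rc > 0)
		*value = *(u8 *)kaddr;
	dax_read_unlock(id);

	return rc < 0 ? rc : 0;
}
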
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
		size_t bytes, struct iov_iter *i)
{
	if (!dax_alive(dax_dev))
		return 0;

	return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);

int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
			size_t nr_pages)
{
	if (!dax_alive(dax_dev))
		return -ENXIO;
	/*
	 * There are no callers that want to zero more than one page as of
	 * now. Once users are there, this check can be removed after the
	 * device mapper code has been updated to split ranges across targets.
	 */
	if (nr_pages != 1)
		return -EIO;

	return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
	if (unlikely(!dax_write_cache_enabled(dax_dev)))
		return;

	arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
	if (wc)
		set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
	else
		clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);

bool __dax_synchronous(struct dax_device *dax_dev)
{
	return test_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__dax_synchronous);

void __set_dax_synchronous(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__set_dax_synchronous);

bool dax_alive(struct dax_device *dax_dev)
{
	lockdep_assert_held(&dax_srcu);
	return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);

/*
 * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring
 * that any fault handlers or operations that might have seen
 * dax_alive(), have completed.  Any operations that start after
 * synchronize_srcu() has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;

	clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
	synchronize_srcu(&dax_srcu);
}
EXPORT_SYMBOL_GPL(kill_dax);

void run_dax(struct dax_device *dax_dev)
{
	set_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(run_dax);

static struct inode *dax_alloc_inode(struct super_block *sb)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
	if (!dax_dev)
		return NULL;

	inode = &dax_dev->inode;
	inode->i_rdev = 0;
	return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
	return container_of(inode, struct dax_device, inode);
}

static void dax_free_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);

	if (inode->i_rdev)
		ida_simple_remove(&dax_minor_ida, iminor(inode));
	kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
	struct dax_device *dax_dev = to_dax_dev(inode);

	WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
			"kill_dax() must be called before final iput()\n");
}

static const struct super_operations dax_sops = {
	.statfs = simple_statfs,
	.alloc_inode = dax_alloc_inode,
	.destroy_inode = dax_destroy_inode,
	.free_inode = dax_free_inode,
	.drop_inode = generic_delete_inode,
};

static int dax_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC);

	if (!ctx)
		return -ENOMEM;
	ctx->ops = &dax_sops;
	return 0;
}

static struct file_system_type dax_fs_type = {
	.name = "dax",
	.init_fs_context = dax_init_fs_context,
	.kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
	dev_t devt = *(dev_t *) data;

	inode->i_rdev = devt;
	return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
	struct dax_device *dax_dev;
	struct inode *inode;

	inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
			dax_test, dax_set, &devt);

	if (!inode)
		return NULL;

	dax_dev = to_dax_dev(inode);
	if (inode->i_state & I_NEW) {
		set_bit(DAXDEV_ALIVE, &dax_dev->flags);
		inode->i_cdev = &dax_dev->cdev;
		inode->i_mode = S_IFCHR;
		inode->i_flags = S_DAX;
		mapping_set_gfp_mask(&inode->i_data, GFP_USER);
		unlock_new_inode(inode);
	}

	return dax_dev;
}

struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
		unsigned long flags)
{
	struct dax_device *dax_dev;
	dev_t devt;
	int minor;

	if (WARN_ON_ONCE(ops && !ops->zero_page_range))
		return ERR_PTR(-EINVAL);

	minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
	if (minor < 0)
		return ERR_PTR(-ENOMEM);

	devt = MKDEV(MAJOR(dax_devt), minor);
	dax_dev = dax_dev_get(devt);
	if (!dax_dev)
		goto err_dev;

	dax_dev->ops = ops;
	dax_dev->private = private;
	if (flags & DAXDEV_F_SYNC)
		set_dax_synchronous(dax_dev);

	return dax_dev;

err_dev:
	ida_simple_remove(&dax_minor_ida, minor);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);
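
/*
 * Example (sketch): the provider-side lifecycle implied by alloc_dax(),
 * kill_dax() and put_dax(). example_provider_init()/example_provider_exit()
 * and their arguments are illustrative assumptions, not an in-tree driver.
 */
static inline int example_provider_init(void *private,
		const struct dax_operations *ops,
		struct dax_device **dax_dev)
{
	/* alloc_dax() returns ERR_PTR(-EINVAL) or ERR_PTR(-ENOMEM) on failure */
	*dax_dev = alloc_dax(private, ops, DAXDEV_F_SYNC);
	if (IS_ERR(*dax_dev))
		return PTR_ERR(*dax_dev);
	return 0;
}

static inline void example_provider_exit(struct dax_device *dax_dev)
{
	/* block new operations and wait out in-flight dax_read_lock() users */
	kill_dax(dax_dev);
	/* then drop the inode reference taken by alloc_dax() */
	put_dax(dax_dev);
}
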
void put_dax(struct dax_device *dax_dev)
{
	if (!dax_dev)
		return;
	iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * inode_dax(): convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev() which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
	struct cdev *cdev = inode->i_cdev;

	return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
	return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
	if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags))
		return NULL;
	return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
	struct dax_device *dax_dev = _dax_dev;
	struct inode *inode = &dax_dev->inode;

	memset(dax_dev, 0, sizeof(*dax_dev));
	inode_init_once(inode);
}

static int dax_fs_init(void)
{
	int rc;

	dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
			 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
			init_once);
	if (!dax_cache)
		return -ENOMEM;

	dax_mnt = kern_mount(&dax_fs_type);
	if (IS_ERR(dax_mnt)) {
		rc = PTR_ERR(dax_mnt);
		goto err_mount;
	}
	dax_superblock = dax_mnt->mnt_sb;

	return 0;

err_mount:
	kmem_cache_destroy(dax_cache);

	return rc;
}

static void dax_fs_exit(void)
{
	kern_unmount(dax_mnt);
	kmem_cache_destroy(dax_cache);
}

static int __init dax_core_init(void)
{
	int rc;

	rc = dax_fs_init();
	if (rc)
		return rc;

	rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
	if (rc)
		goto err_chrdev;

	rc = dax_bus_init();
	if (rc)
		goto err_bus;
	return 0;

err_bus:
	unregister_chrdev_region(dax_devt, MINORMASK+1);
err_chrdev:
	dax_fs_exit();
	return rc;
}

static void __exit dax_core_exit(void)
{
	dax_bus_exit();
	unregister_chrdev_region(dax_devt, MINORMASK+1);
	ida_destroy(&dax_minor_ida);
	dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_core_init);
module_exit(dax_core_exit);
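
/*
 * Example (sketch): how a "device dax" style chardev open() can recover the
 * dax_device and its driver data from the public inode, in the spirit of
 * drivers/dax/device.c. example_open() and its use of file->private_data
 * are illustrative assumptions, not a registered file_operations handler.
 */
static inline int example_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	void *data = dax_get_private(dax_dev);

	/* a killed device no longer hands out its private data */
	if (!data)
		return -ENXIO;

	filp->private_data = data;
	return 0;
}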