// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright(c) 2017 Intel Corporation. All rights reserved.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include "dax-private.h"

/**
 * struct dax_device - anchor object for dax services
 * @inode: core vfs
 * @cdev: optional character interface for "device dax"
 * @private: dax driver private data
 * @flags: state and boolean properties
 * @ops: operations vector supplied by the dax driver
 */
struct dax_device {
        struct inode inode;
        struct cdev cdev;
        void *private;
        unsigned long flags;
        const struct dax_operations *ops;
};

static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct vfsmount *dax_mnt;
static DEFINE_IDA(dax_minor_ida);
static struct kmem_cache *dax_cache __read_mostly;
static struct super_block *dax_superblock __read_mostly;

int dax_read_lock(void)
{
        return srcu_read_lock(&dax_srcu);
}
EXPORT_SYMBOL_GPL(dax_read_lock);

void dax_read_unlock(int id)
{
        srcu_read_unlock(&dax_srcu, id);
}
EXPORT_SYMBOL_GPL(dax_read_unlock);
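/*
 * Usage sketch (illustrative, not part of the core): queries such as
 * dax_alive() are only meaningful inside an SRCU read-side section,
 * which pairs with kill_dax()'s synchronize_srcu(). The helper name
 * "example_dax_usable" is hypothetical.
 */
static __maybe_unused bool example_dax_usable(struct dax_device *dax_dev)
{
        bool alive;
        int id;

        id = dax_read_lock();
        /* kill_dax() cannot complete while this lock is held */
        alive = dax_alive(dax_dev);
        dax_read_unlock(id);

        return alive;
}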
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
#include <linux/blkdev.h>

static DEFINE_XARRAY(dax_hosts);

int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
        return xa_insert(&dax_hosts, (unsigned long)disk, dax_dev, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(dax_add_host);

void dax_remove_host(struct gendisk *disk)
{
        xa_erase(&dax_hosts, (unsigned long)disk);
}
EXPORT_SYMBOL_GPL(dax_remove_host);

int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
                pgoff_t *pgoff)
{
        sector_t start_sect = bdev ? get_start_sect(bdev) : 0;
        phys_addr_t phys_off = (start_sect + sector) * SECTOR_SIZE;

        if (pgoff)
                *pgoff = PHYS_PFN(phys_off);
        if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
                return -EINVAL;
        return 0;
}
EXPORT_SYMBOL(bdev_dax_pgoff);

/**
 * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax
 * @bdev: block device to find a dax_device for
 *
 * Return: a referenced dax_device if @bdev supports dax and is page
 * aligned, otherwise NULL.
 */
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
{
        struct dax_device *dax_dev;
        int id;

        if (!blk_queue_dax(bdev->bd_disk->queue))
                return NULL;

        if ((get_start_sect(bdev) * SECTOR_SIZE) % PAGE_SIZE ||
            (bdev_nr_sectors(bdev) * SECTOR_SIZE) % PAGE_SIZE) {
                pr_info("%pg: error: unaligned partition for dax\n", bdev);
                return NULL;
        }

        id = dax_read_lock();
        dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk);
        if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode))
                dax_dev = NULL;
        dax_read_unlock(id);

        return dax_dev;
}
EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

enum dax_device_flags {
        /* !alive + rcu grace period == no new operations / mappings */
        DAXDEV_ALIVE,
        /* gate whether dax_flush() calls the low-level flush routine */
        DAXDEV_WRITE_CACHE,
        /* flag to check if device supports synchronous flush */
        DAXDEV_SYNC,
};

/**
 * dax_direct_access() - translate a device pgoff to an absolute pfn
 * @dax_dev: a dax_device instance representing the logical memory range
 * @pgoff: offset in pages from the start of the device to translate
 * @nr_pages: number of consecutive pages caller can handle relative to @pfn
 * @kaddr: output parameter that returns a virtual address mapping of pfn
 * @pfn: output parameter that returns an absolute pfn translation of @pgoff
 *
 * Return: negative errno if an error occurs, otherwise the number of
 * pages accessible at the device-relative @pgoff.
 */
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                void **kaddr, pfn_t *pfn)
{
        long avail;

        if (!dax_dev)
                return -EOPNOTSUPP;

        if (!dax_alive(dax_dev))
                return -ENXIO;

        if (nr_pages < 0)
                return -EINVAL;

        avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages,
                        kaddr, pfn);
        if (!avail)
                return -ERANGE;
        return min(avail, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_direct_access);
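/*
 * Usage sketch (illustrative, not part of the core): copy one page out
 * of a dax device. The SRCU read lock is held across both the
 * translation and the access, so kill_dax() cannot tear the mapping
 * down mid-copy. "example_read_page" and its parameters are
 * hypothetical.
 */
#include <linux/string.h>       /* memcpy(), for the sketch below */

static __maybe_unused long example_read_page(struct dax_device *dax_dev,
                pgoff_t pgoff, void *dst)
{
        void *kaddr;
        pfn_t pfn;
        long rc;
        int id;

        id = dax_read_lock();
        rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
        if (rc > 0)
                memcpy(dst, kaddr, PAGE_SIZE);
        dax_read_unlock(id);

        return rc < 0 ? rc : 0;
}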
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);

size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i)
{
        if (!dax_alive(dax_dev))
                return 0;

        return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);

int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                size_t nr_pages)
{
        if (!dax_alive(dax_dev))
                return -ENXIO;
        /*
         * No current caller zeroes more than one page at a time. Once
         * such callers appear, this check can be removed, after the
         * device-mapper code has been updated to split ranges across
         * targets.
         */
        if (nr_pages != 1)
                return -EIO;

        return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
}
EXPORT_SYMBOL_GPL(dax_zero_page_range);

#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
        if (unlikely(!dax_write_cache_enabled(dax_dev)))
                return;

        arch_wb_cache_pmem(addr, size);
}
#else
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
}
#endif
EXPORT_SYMBOL_GPL(dax_flush);

void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
        if (wc)
                set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
        else
                clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache);

bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_write_cache_enabled);

bool __dax_synchronous(struct dax_device *dax_dev)
{
        return test_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__dax_synchronous);

void __set_dax_synchronous(struct dax_device *dax_dev)
{
        set_bit(DAXDEV_SYNC, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(__set_dax_synchronous);

bool dax_alive(struct dax_device *dax_dev)
{
        lockdep_assert_held(&dax_srcu);
        return test_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(dax_alive);
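/*
 * Usage sketch (illustrative, not part of the core): a write path that
 * wants durability follows the store with dax_flush(); the flush
 * degrades to a nop unless the driver enabled it via dax_write_cache().
 * "example_write_persistent" and its parameters are hypothetical.
 */
static __maybe_unused void example_write_persistent(struct dax_device *dax_dev,
                void *kaddr, const void *src, size_t len)
{
        memcpy(kaddr, src, len);
        dax_flush(dax_dev, kaddr, len); /* writeback CPU caches, if enabled */
}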
/*
 * Note: rcu does not protect the liveness of dax_dev; it ensures that
 * any fault handlers or operations that might have seen dax_alive()
 * have completed. Any operations that start after synchronize_srcu()
 * has run will abort upon seeing !dax_alive().
 */
void kill_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;

        clear_bit(DAXDEV_ALIVE, &dax_dev->flags);
        synchronize_srcu(&dax_srcu);
}
EXPORT_SYMBOL_GPL(kill_dax);

void run_dax(struct dax_device *dax_dev)
{
        set_bit(DAXDEV_ALIVE, &dax_dev->flags);
}
EXPORT_SYMBOL_GPL(run_dax);

static struct inode *dax_alloc_inode(struct super_block *sb)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
        if (!dax_dev)
                return NULL;

        inode = &dax_dev->inode;
        inode->i_rdev = 0;
        return inode;
}

static struct dax_device *to_dax_dev(struct inode *inode)
{
        return container_of(inode, struct dax_device, inode);
}

static void dax_free_inode(struct inode *inode)
{
        struct dax_device *dax_dev = to_dax_dev(inode);

        if (inode->i_rdev)
                ida_simple_remove(&dax_minor_ida, iminor(inode));
        kmem_cache_free(dax_cache, dax_dev);
}

static void dax_destroy_inode(struct inode *inode)
{
        struct dax_device *dax_dev = to_dax_dev(inode);

        WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
                        "kill_dax() must be called before final iput()\n");
}

static const struct super_operations dax_sops = {
        .statfs = simple_statfs,
        .alloc_inode = dax_alloc_inode,
        .destroy_inode = dax_destroy_inode,
        .free_inode = dax_free_inode,
        .drop_inode = generic_delete_inode,
};

static int dax_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC);

        if (!ctx)
                return -ENOMEM;
        ctx->ops = &dax_sops;
        return 0;
}

static struct file_system_type dax_fs_type = {
        .name = "dax",
        .init_fs_context = dax_init_fs_context,
        .kill_sb = kill_anon_super,
};

static int dax_test(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        return inode->i_rdev == devt;
}

static int dax_set(struct inode *inode, void *data)
{
        dev_t devt = *(dev_t *) data;

        inode->i_rdev = devt;
        return 0;
}

static struct dax_device *dax_dev_get(dev_t devt)
{
        struct dax_device *dax_dev;
        struct inode *inode;

        inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31),
                        dax_test, dax_set, &devt);
        if (!inode)
                return NULL;

        dax_dev = to_dax_dev(inode);
        if (inode->i_state & I_NEW) {
                set_bit(DAXDEV_ALIVE, &dax_dev->flags);
                inode->i_cdev = &dax_dev->cdev;
                inode->i_mode = S_IFCHR;
                inode->i_flags = S_DAX;
                mapping_set_gfp_mask(&inode->i_data, GFP_USER);
                unlock_new_inode(inode);
        }

        return dax_dev;
}

struct dax_device *alloc_dax(void *private, const struct dax_operations *ops,
                unsigned long flags)
{
        struct dax_device *dax_dev;
        dev_t devt;
        int minor;

        if (WARN_ON_ONCE(ops && !ops->zero_page_range))
                return ERR_PTR(-EINVAL);

        minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL);
        if (minor < 0)
                return ERR_PTR(-ENOMEM);

        devt = MKDEV(MAJOR(dax_devt), minor);
        dax_dev = dax_dev_get(devt);
        if (!dax_dev)
                goto err_dev;

        dax_dev->ops = ops;
        dax_dev->private = private;
        if (flags & DAXDEV_F_SYNC)
                set_dax_synchronous(dax_dev);

        return dax_dev;

 err_dev:
        ida_simple_remove(&dax_minor_ida, minor);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(alloc_dax);
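/*
 * Usage sketch (illustrative, not part of the core): the provider-side
 * lifecycle. A driver allocates the dax_device against its operations
 * vector at probe time and, on removal, kills it (which waits out any
 * in-flight SRCU readers) before dropping the final reference.
 * "struct example_drv" and the example_* functions are hypothetical.
 */
struct example_drv {
        struct dax_device *dax_dev;
};

static __maybe_unused int example_probe(struct example_drv *drv,
                void *private, const struct dax_operations *ops)
{
        /* ops->zero_page_range must be populated, see alloc_dax() */
        drv->dax_dev = alloc_dax(private, ops, DAXDEV_F_SYNC);
        if (IS_ERR(drv->dax_dev))
                return PTR_ERR(drv->dax_dev);
        return 0;
}

static __maybe_unused void example_remove(struct example_drv *drv)
{
        kill_dax(drv->dax_dev); /* block new ops, drain current ones */
        put_dax(drv->dax_dev);  /* final iput() frees via dax_free_inode() */
}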
void put_dax(struct dax_device *dax_dev)
{
        if (!dax_dev)
                return;
        iput(&dax_dev->inode);
}
EXPORT_SYMBOL_GPL(put_dax);

/**
 * inode_dax() - convert a public inode into its dax_dev
 * @inode: An inode with i_cdev pointing to a dax_dev
 *
 * Note this is not equivalent to to_dax_dev() which is for private
 * internal use where we know the inode filesystem type == dax_fs_type.
 */
struct dax_device *inode_dax(struct inode *inode)
{
        struct cdev *cdev = inode->i_cdev;

        return container_of(cdev, struct dax_device, cdev);
}
EXPORT_SYMBOL_GPL(inode_dax);

struct inode *dax_inode(struct dax_device *dax_dev)
{
        return &dax_dev->inode;
}
EXPORT_SYMBOL_GPL(dax_inode);

void *dax_get_private(struct dax_device *dax_dev)
{
        if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags))
                return NULL;
        return dax_dev->private;
}
EXPORT_SYMBOL_GPL(dax_get_private);

static void init_once(void *_dax_dev)
{
        struct dax_device *dax_dev = _dax_dev;
        struct inode *inode = &dax_dev->inode;

        memset(dax_dev, 0, sizeof(*dax_dev));
        inode_init_once(inode);
}

static int dax_fs_init(void)
{
        int rc;

        dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                         SLAB_MEM_SPREAD|SLAB_ACCOUNT),
                        init_once);
        if (!dax_cache)
                return -ENOMEM;

        dax_mnt = kern_mount(&dax_fs_type);
        if (IS_ERR(dax_mnt)) {
                rc = PTR_ERR(dax_mnt);
                goto err_mount;
        }
        dax_superblock = dax_mnt->mnt_sb;

        return 0;

 err_mount:
        kmem_cache_destroy(dax_cache);

        return rc;
}

static void dax_fs_exit(void)
{
        kern_unmount(dax_mnt);
        kmem_cache_destroy(dax_cache);
}

static int __init dax_core_init(void)
{
        int rc;

        rc = dax_fs_init();
        if (rc)
                return rc;

        rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax");
        if (rc)
                goto err_chrdev;

        rc = dax_bus_init();
        if (rc)
                goto err_bus;
        return 0;

 err_bus:
        unregister_chrdev_region(dax_devt, MINORMASK+1);
 err_chrdev:
        dax_fs_exit();
        return rc;
}

static void __exit dax_core_exit(void)
{
        dax_bus_exit();
        unregister_chrdev_region(dax_devt, MINORMASK+1);
        ida_destroy(&dax_minor_ida);
        dax_fs_exit();
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_core_init);
module_exit(dax_core_exit);
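/*
 * Usage sketch (illustrative, not part of the core): a chardev
 * consumer in the style of device-dax (compare drivers/dax/device.c)
 * recovers the dax_device from the public inode in its open() path via
 * inode_dax(), then fetches driver state with dax_get_private(), which
 * returns NULL once kill_dax() has run. "example_open" is hypothetical.
 */
static __maybe_unused int example_open(struct inode *inode, struct file *filp)
{
        struct dax_device *dax_dev = inode_dax(inode);

        filp->private_data = dax_get_private(dax_dev);

        return filp->private_data ? 0 : -ENXIO;
}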