1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright(c) 2017 Intel Corporation. All rights reserved. 4 */ 5 #include <linux/pagemap.h> 6 #include <linux/module.h> 7 #include <linux/mount.h> 8 #include <linux/pseudo_fs.h> 9 #include <linux/magic.h> 10 #include <linux/genhd.h> 11 #include <linux/pfn_t.h> 12 #include <linux/cdev.h> 13 #include <linux/hash.h> 14 #include <linux/slab.h> 15 #include <linux/uio.h> 16 #include <linux/dax.h> 17 #include <linux/fs.h> 18 #include "dax-private.h" 19 20 /** 21 * struct dax_device - anchor object for dax services 22 * @inode: core vfs 23 * @cdev: optional character interface for "device dax" 24 * @host: optional name for lookups where the device path is not available 25 * @private: dax driver private data 26 * @flags: state and boolean properties 27 */ 28 struct dax_device { 29 struct hlist_node list; 30 struct inode inode; 31 struct cdev cdev; 32 const char *host; 33 void *private; 34 unsigned long flags; 35 const struct dax_operations *ops; 36 }; 37 38 static dev_t dax_devt; 39 DEFINE_STATIC_SRCU(dax_srcu); 40 static struct vfsmount *dax_mnt; 41 static DEFINE_IDA(dax_minor_ida); 42 static struct kmem_cache *dax_cache __read_mostly; 43 static struct super_block *dax_superblock __read_mostly; 44 45 #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 46 static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 47 static DEFINE_SPINLOCK(dax_host_lock); 48 49 int dax_read_lock(void) 50 { 51 return srcu_read_lock(&dax_srcu); 52 } 53 EXPORT_SYMBOL_GPL(dax_read_lock); 54 55 void dax_read_unlock(int id) 56 { 57 srcu_read_unlock(&dax_srcu, id); 58 } 59 EXPORT_SYMBOL_GPL(dax_read_unlock); 60 61 static int dax_host_hash(const char *host) 62 { 63 return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 64 } 65 66 #ifdef CONFIG_BLOCK 67 #include <linux/blkdev.h> 68 69 int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 70 pgoff_t *pgoff) 71 { 72 sector_t start_sect = bdev ? get_start_sect(bdev) : 0; 73 phys_addr_t phys_off = (start_sect + sector) * 512; 74 75 if (pgoff) 76 *pgoff = PHYS_PFN(phys_off); 77 if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 78 return -EINVAL; 79 return 0; 80 } 81 EXPORT_SYMBOL(bdev_dax_pgoff); 82 83 #if IS_ENABLED(CONFIG_FS_DAX) 84 /** 85 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 86 * @host: alternate name for the device registered by a dax driver 87 */ 88 static struct dax_device *dax_get_by_host(const char *host) 89 { 90 struct dax_device *dax_dev, *found = NULL; 91 int hash, id; 92 93 if (!host) 94 return NULL; 95 96 hash = dax_host_hash(host); 97 98 id = dax_read_lock(); 99 spin_lock(&dax_host_lock); 100 hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 101 if (!dax_alive(dax_dev) 102 || strcmp(host, dax_dev->host) != 0) 103 continue; 104 105 if (igrab(&dax_dev->inode)) 106 found = dax_dev; 107 break; 108 } 109 spin_unlock(&dax_host_lock); 110 dax_read_unlock(id); 111 112 return found; 113 } 114 115 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) 116 { 117 if (!blk_queue_dax(bdev->bd_disk->queue)) 118 return NULL; 119 return dax_get_by_host(bdev->bd_disk->disk_name); 120 } 121 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 122 123 bool generic_fsdax_supported(struct dax_device *dax_dev, 124 struct block_device *bdev, int blocksize, sector_t start, 125 sector_t sectors) 126 { 127 bool dax_enabled = false; 128 pgoff_t pgoff, pgoff_end; 129 void *kaddr, *end_kaddr; 130 pfn_t pfn, end_pfn; 131 sector_t last_page; 132 long len, len2; 133 int err, id; 134 135 if (blocksize != PAGE_SIZE) { 136 pr_info("%pg: error: unsupported blocksize for dax\n", bdev); 137 return false; 138 } 139 140 if (!dax_dev) { 141 pr_debug("%pg: error: dax unsupported by block device\n", bdev); 142 return false; 143 } 144 145 err = bdev_dax_pgoff(bdev, start, PAGE_SIZE, &pgoff); 146 if (err) { 147 pr_info("%pg: error: unaligned partition for dax\n", bdev); 148 return false; 149 } 150 151 last_page = PFN_DOWN((start + sectors - 1) * 512) * PAGE_SIZE / 512; 152 err = bdev_dax_pgoff(bdev, last_page, PAGE_SIZE, &pgoff_end); 153 if (err) { 154 pr_info("%pg: error: unaligned partition for dax\n", bdev); 155 return false; 156 } 157 158 id = dax_read_lock(); 159 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 160 len2 = dax_direct_access(dax_dev, pgoff_end, 1, &end_kaddr, &end_pfn); 161 162 if (len < 1 || len2 < 1) { 163 pr_info("%pg: error: dax access failed (%ld)\n", 164 bdev, len < 1 ? len : len2); 165 dax_read_unlock(id); 166 return false; 167 } 168 169 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { 170 /* 171 * An arch that has enabled the pmem api should also 172 * have its drivers support pfn_t_devmap() 173 * 174 * This is a developer warning and should not trigger in 175 * production. dax_flush() will crash since it depends 176 * on being able to do (page_address(pfn_to_page())). 177 */ 178 WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); 179 dax_enabled = true; 180 } else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) { 181 struct dev_pagemap *pgmap, *end_pgmap; 182 183 pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); 184 end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL); 185 if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX 186 && pfn_t_to_page(pfn)->pgmap == pgmap 187 && pfn_t_to_page(end_pfn)->pgmap == pgmap 188 && pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr)) 189 && pfn_t_to_pfn(end_pfn) == PHYS_PFN(__pa(end_kaddr))) 190 dax_enabled = true; 191 put_dev_pagemap(pgmap); 192 put_dev_pagemap(end_pgmap); 193 194 } 195 dax_read_unlock(id); 196 197 if (!dax_enabled) { 198 pr_info("%pg: error: dax support not enabled\n", bdev); 199 return false; 200 } 201 return true; 202 } 203 EXPORT_SYMBOL_GPL(generic_fsdax_supported); 204 205 bool dax_supported(struct dax_device *dax_dev, struct block_device *bdev, 206 int blocksize, sector_t start, sector_t len) 207 { 208 bool ret = false; 209 int id; 210 211 if (!dax_dev) 212 return false; 213 214 id = dax_read_lock(); 215 if (dax_alive(dax_dev) && dax_dev->ops->dax_supported) 216 ret = dax_dev->ops->dax_supported(dax_dev, bdev, blocksize, 217 start, len); 218 dax_read_unlock(id); 219 return ret; 220 } 221 EXPORT_SYMBOL_GPL(dax_supported); 222 #endif /* CONFIG_FS_DAX */ 223 #endif /* CONFIG_BLOCK */ 224 225 enum dax_device_flags { 226 /* !alive + rcu grace period == no new operations / mappings */ 227 DAXDEV_ALIVE, 228 /* gate whether dax_flush() calls the low level flush routine */ 229 DAXDEV_WRITE_CACHE, 230 /* flag to check if device supports synchronous flush */ 231 DAXDEV_SYNC, 232 }; 233 234 /** 235 * dax_direct_access() - translate a device pgoff to an absolute pfn 236 * @dax_dev: a dax_device instance representing the logical memory range 237 * @pgoff: offset in pages from the start of the device to translate 238 * @nr_pages: number of consecutive pages caller can handle relative to @pfn 239 * @kaddr: output parameter that returns a virtual address mapping of pfn 240 * @pfn: output parameter that returns an absolute pfn translation of @pgoff 241 * 242 * Return: negative errno if an error occurs, otherwise the number of 243 * pages accessible at the device relative @pgoff. 244 */ 245 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 246 void **kaddr, pfn_t *pfn) 247 { 248 long avail; 249 250 if (!dax_dev) 251 return -EOPNOTSUPP; 252 253 if (!dax_alive(dax_dev)) 254 return -ENXIO; 255 256 if (nr_pages < 0) 257 return -EINVAL; 258 259 avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, 260 kaddr, pfn); 261 if (!avail) 262 return -ERANGE; 263 return min(avail, nr_pages); 264 } 265 EXPORT_SYMBOL_GPL(dax_direct_access); 266 267 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 268 size_t bytes, struct iov_iter *i) 269 { 270 if (!dax_alive(dax_dev)) 271 return 0; 272 273 return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); 274 } 275 EXPORT_SYMBOL_GPL(dax_copy_from_iter); 276 277 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 278 size_t bytes, struct iov_iter *i) 279 { 280 if (!dax_alive(dax_dev)) 281 return 0; 282 283 return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); 284 } 285 EXPORT_SYMBOL_GPL(dax_copy_to_iter); 286 287 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, 288 size_t nr_pages) 289 { 290 if (!dax_alive(dax_dev)) 291 return -ENXIO; 292 /* 293 * There are no callers that want to zero more than one page as of now. 294 * Once users are there, this check can be removed after the 295 * device mapper code has been updated to split ranges across targets. 296 */ 297 if (nr_pages != 1) 298 return -EIO; 299 300 return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages); 301 } 302 EXPORT_SYMBOL_GPL(dax_zero_page_range); 303 304 #ifdef CONFIG_ARCH_HAS_PMEM_API 305 void arch_wb_cache_pmem(void *addr, size_t size); 306 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 307 { 308 if (unlikely(!dax_write_cache_enabled(dax_dev))) 309 return; 310 311 arch_wb_cache_pmem(addr, size); 312 } 313 #else 314 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 315 { 316 } 317 #endif 318 EXPORT_SYMBOL_GPL(dax_flush); 319 320 void dax_write_cache(struct dax_device *dax_dev, bool wc) 321 { 322 if (wc) 323 set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 324 else 325 clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 326 } 327 EXPORT_SYMBOL_GPL(dax_write_cache); 328 329 bool dax_write_cache_enabled(struct dax_device *dax_dev) 330 { 331 return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 332 } 333 EXPORT_SYMBOL_GPL(dax_write_cache_enabled); 334 335 bool __dax_synchronous(struct dax_device *dax_dev) 336 { 337 return test_bit(DAXDEV_SYNC, &dax_dev->flags); 338 } 339 EXPORT_SYMBOL_GPL(__dax_synchronous); 340 341 void __set_dax_synchronous(struct dax_device *dax_dev) 342 { 343 set_bit(DAXDEV_SYNC, &dax_dev->flags); 344 } 345 EXPORT_SYMBOL_GPL(__set_dax_synchronous); 346 347 bool dax_alive(struct dax_device *dax_dev) 348 { 349 lockdep_assert_held(&dax_srcu); 350 return test_bit(DAXDEV_ALIVE, &dax_dev->flags); 351 } 352 EXPORT_SYMBOL_GPL(dax_alive); 353 354 /* 355 * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring 356 * that any fault handlers or operations that might have seen 357 * dax_alive(), have completed. Any operations that start after 358 * synchronize_srcu() has run will abort upon seeing !dax_alive(). 359 */ 360 void kill_dax(struct dax_device *dax_dev) 361 { 362 if (!dax_dev) 363 return; 364 365 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 366 367 synchronize_srcu(&dax_srcu); 368 369 spin_lock(&dax_host_lock); 370 hlist_del_init(&dax_dev->list); 371 spin_unlock(&dax_host_lock); 372 } 373 EXPORT_SYMBOL_GPL(kill_dax); 374 375 void run_dax(struct dax_device *dax_dev) 376 { 377 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 378 } 379 EXPORT_SYMBOL_GPL(run_dax); 380 381 static struct inode *dax_alloc_inode(struct super_block *sb) 382 { 383 struct dax_device *dax_dev; 384 struct inode *inode; 385 386 dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); 387 if (!dax_dev) 388 return NULL; 389 390 inode = &dax_dev->inode; 391 inode->i_rdev = 0; 392 return inode; 393 } 394 395 static struct dax_device *to_dax_dev(struct inode *inode) 396 { 397 return container_of(inode, struct dax_device, inode); 398 } 399 400 static void dax_free_inode(struct inode *inode) 401 { 402 struct dax_device *dax_dev = to_dax_dev(inode); 403 kfree(dax_dev->host); 404 dax_dev->host = NULL; 405 if (inode->i_rdev) 406 ida_simple_remove(&dax_minor_ida, iminor(inode)); 407 kmem_cache_free(dax_cache, dax_dev); 408 } 409 410 static void dax_destroy_inode(struct inode *inode) 411 { 412 struct dax_device *dax_dev = to_dax_dev(inode); 413 WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), 414 "kill_dax() must be called before final iput()\n"); 415 } 416 417 static const struct super_operations dax_sops = { 418 .statfs = simple_statfs, 419 .alloc_inode = dax_alloc_inode, 420 .destroy_inode = dax_destroy_inode, 421 .free_inode = dax_free_inode, 422 .drop_inode = generic_delete_inode, 423 }; 424 425 static int dax_init_fs_context(struct fs_context *fc) 426 { 427 struct pseudo_fs_context *ctx = init_pseudo(fc, DAXFS_MAGIC); 428 if (!ctx) 429 return -ENOMEM; 430 ctx->ops = &dax_sops; 431 return 0; 432 } 433 434 static struct file_system_type dax_fs_type = { 435 .name = "dax", 436 .init_fs_context = dax_init_fs_context, 437 .kill_sb = kill_anon_super, 438 }; 439 440 static int dax_test(struct inode *inode, void *data) 441 { 442 dev_t devt = *(dev_t *) data; 443 444 return inode->i_rdev == devt; 445 } 446 447 static int dax_set(struct inode *inode, void *data) 448 { 449 dev_t devt = *(dev_t *) data; 450 451 inode->i_rdev = devt; 452 return 0; 453 } 454 455 static struct dax_device *dax_dev_get(dev_t devt) 456 { 457 struct dax_device *dax_dev; 458 struct inode *inode; 459 460 inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 461 dax_test, dax_set, &devt); 462 463 if (!inode) 464 return NULL; 465 466 dax_dev = to_dax_dev(inode); 467 if (inode->i_state & I_NEW) { 468 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 469 inode->i_cdev = &dax_dev->cdev; 470 inode->i_mode = S_IFCHR; 471 inode->i_flags = S_DAX; 472 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 473 unlock_new_inode(inode); 474 } 475 476 return dax_dev; 477 } 478 479 static void dax_add_host(struct dax_device *dax_dev, const char *host) 480 { 481 int hash; 482 483 /* 484 * Unconditionally init dax_dev since it's coming from a 485 * non-zeroed slab cache 486 */ 487 INIT_HLIST_NODE(&dax_dev->list); 488 dax_dev->host = host; 489 if (!host) 490 return; 491 492 hash = dax_host_hash(host); 493 spin_lock(&dax_host_lock); 494 hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 495 spin_unlock(&dax_host_lock); 496 } 497 498 struct dax_device *alloc_dax(void *private, const char *__host, 499 const struct dax_operations *ops, unsigned long flags) 500 { 501 struct dax_device *dax_dev; 502 const char *host; 503 dev_t devt; 504 int minor; 505 506 if (ops && !ops->zero_page_range) { 507 pr_debug("%s: error: device does not provide dax" 508 " operation zero_page_range()\n", 509 __host ? __host : "Unknown"); 510 return ERR_PTR(-EINVAL); 511 } 512 513 host = kstrdup(__host, GFP_KERNEL); 514 if (__host && !host) 515 return ERR_PTR(-ENOMEM); 516 517 minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); 518 if (minor < 0) 519 goto err_minor; 520 521 devt = MKDEV(MAJOR(dax_devt), minor); 522 dax_dev = dax_dev_get(devt); 523 if (!dax_dev) 524 goto err_dev; 525 526 dax_add_host(dax_dev, host); 527 dax_dev->ops = ops; 528 dax_dev->private = private; 529 if (flags & DAXDEV_F_SYNC) 530 set_dax_synchronous(dax_dev); 531 532 return dax_dev; 533 534 err_dev: 535 ida_simple_remove(&dax_minor_ida, minor); 536 err_minor: 537 kfree(host); 538 return ERR_PTR(-ENOMEM); 539 } 540 EXPORT_SYMBOL_GPL(alloc_dax); 541 542 void put_dax(struct dax_device *dax_dev) 543 { 544 if (!dax_dev) 545 return; 546 iput(&dax_dev->inode); 547 } 548 EXPORT_SYMBOL_GPL(put_dax); 549 550 /** 551 * inode_dax: convert a public inode into its dax_dev 552 * @inode: An inode with i_cdev pointing to a dax_dev 553 * 554 * Note this is not equivalent to to_dax_dev() which is for private 555 * internal use where we know the inode filesystem type == dax_fs_type. 556 */ 557 struct dax_device *inode_dax(struct inode *inode) 558 { 559 struct cdev *cdev = inode->i_cdev; 560 561 return container_of(cdev, struct dax_device, cdev); 562 } 563 EXPORT_SYMBOL_GPL(inode_dax); 564 565 struct inode *dax_inode(struct dax_device *dax_dev) 566 { 567 return &dax_dev->inode; 568 } 569 EXPORT_SYMBOL_GPL(dax_inode); 570 571 void *dax_get_private(struct dax_device *dax_dev) 572 { 573 if (!test_bit(DAXDEV_ALIVE, &dax_dev->flags)) 574 return NULL; 575 return dax_dev->private; 576 } 577 EXPORT_SYMBOL_GPL(dax_get_private); 578 579 static void init_once(void *_dax_dev) 580 { 581 struct dax_device *dax_dev = _dax_dev; 582 struct inode *inode = &dax_dev->inode; 583 584 memset(dax_dev, 0, sizeof(*dax_dev)); 585 inode_init_once(inode); 586 } 587 588 static int dax_fs_init(void) 589 { 590 int rc; 591 592 dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, 593 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 594 SLAB_MEM_SPREAD|SLAB_ACCOUNT), 595 init_once); 596 if (!dax_cache) 597 return -ENOMEM; 598 599 dax_mnt = kern_mount(&dax_fs_type); 600 if (IS_ERR(dax_mnt)) { 601 rc = PTR_ERR(dax_mnt); 602 goto err_mount; 603 } 604 dax_superblock = dax_mnt->mnt_sb; 605 606 return 0; 607 608 err_mount: 609 kmem_cache_destroy(dax_cache); 610 611 return rc; 612 } 613 614 static void dax_fs_exit(void) 615 { 616 kern_unmount(dax_mnt); 617 kmem_cache_destroy(dax_cache); 618 } 619 620 static int __init dax_core_init(void) 621 { 622 int rc; 623 624 rc = dax_fs_init(); 625 if (rc) 626 return rc; 627 628 rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); 629 if (rc) 630 goto err_chrdev; 631 632 rc = dax_bus_init(); 633 if (rc) 634 goto err_bus; 635 return 0; 636 637 err_bus: 638 unregister_chrdev_region(dax_devt, MINORMASK+1); 639 err_chrdev: 640 dax_fs_exit(); 641 return 0; 642 } 643 644 static void __exit dax_core_exit(void) 645 { 646 dax_bus_exit(); 647 unregister_chrdev_region(dax_devt, MINORMASK+1); 648 ida_destroy(&dax_minor_ida); 649 dax_fs_exit(); 650 } 651 652 MODULE_AUTHOR("Intel Corporation"); 653 MODULE_LICENSE("GPL v2"); 654 subsys_initcall(dax_core_init); 655 module_exit(dax_core_exit); 656