1 /* 2 * Copyright(c) 2017 Intel Corporation. All rights reserved. 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of version 2 of the GNU General Public License as 6 * published by the Free Software Foundation. 7 * 8 * This program is distributed in the hope that it will be useful, but 9 * WITHOUT ANY WARRANTY; without even the implied warranty of 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 11 * General Public License for more details. 12 */ 13 #include <linux/pagemap.h> 14 #include <linux/module.h> 15 #include <linux/mount.h> 16 #include <linux/magic.h> 17 #include <linux/genhd.h> 18 #include <linux/pfn_t.h> 19 #include <linux/cdev.h> 20 #include <linux/hash.h> 21 #include <linux/slab.h> 22 #include <linux/uio.h> 23 #include <linux/dax.h> 24 #include <linux/fs.h> 25 26 static dev_t dax_devt; 27 DEFINE_STATIC_SRCU(dax_srcu); 28 static struct vfsmount *dax_mnt; 29 static DEFINE_IDA(dax_minor_ida); 30 static struct kmem_cache *dax_cache __read_mostly; 31 static struct super_block *dax_superblock __read_mostly; 32 33 #define DAX_HASH_SIZE (PAGE_SIZE / sizeof(struct hlist_head)) 34 static struct hlist_head dax_host_list[DAX_HASH_SIZE]; 35 static DEFINE_SPINLOCK(dax_host_lock); 36 37 int dax_read_lock(void) 38 { 39 return srcu_read_lock(&dax_srcu); 40 } 41 EXPORT_SYMBOL_GPL(dax_read_lock); 42 43 void dax_read_unlock(int id) 44 { 45 srcu_read_unlock(&dax_srcu, id); 46 } 47 EXPORT_SYMBOL_GPL(dax_read_unlock); 48 49 #ifdef CONFIG_BLOCK 50 #include <linux/blkdev.h> 51 52 int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, 53 pgoff_t *pgoff) 54 { 55 phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512; 56 57 if (pgoff) 58 *pgoff = PHYS_PFN(phys_off); 59 if (phys_off % PAGE_SIZE || size % PAGE_SIZE) 60 return -EINVAL; 61 return 0; 62 } 63 EXPORT_SYMBOL(bdev_dax_pgoff); 64 65 #if IS_ENABLED(CONFIG_FS_DAX) 66 struct dax_device *fs_dax_get_by_bdev(struct block_device 
*bdev) 67 { 68 if (!blk_queue_dax(bdev->bd_queue)) 69 return NULL; 70 return fs_dax_get_by_host(bdev->bd_disk->disk_name); 71 } 72 EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); 73 #endif 74 75 /** 76 * __bdev_dax_supported() - Check if the device supports dax for filesystem 77 * @bdev: block device to check 78 * @blocksize: The block size of the device 79 * 80 * This is a library function for filesystems to check if the block device 81 * can be mounted with dax option. 82 * 83 * Return: true if supported, false if unsupported 84 */ 85 bool __bdev_dax_supported(struct block_device *bdev, int blocksize) 86 { 87 struct dax_device *dax_dev; 88 bool dax_enabled = false; 89 pgoff_t pgoff; 90 int err, id; 91 void *kaddr; 92 pfn_t pfn; 93 long len; 94 char buf[BDEVNAME_SIZE]; 95 96 if (blocksize != PAGE_SIZE) { 97 pr_debug("%s: error: unsupported blocksize for dax\n", 98 bdevname(bdev, buf)); 99 return false; 100 } 101 102 err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); 103 if (err) { 104 pr_debug("%s: error: unaligned partition for dax\n", 105 bdevname(bdev, buf)); 106 return false; 107 } 108 109 dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); 110 if (!dax_dev) { 111 pr_debug("%s: error: device does not support dax\n", 112 bdevname(bdev, buf)); 113 return false; 114 } 115 116 id = dax_read_lock(); 117 len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn); 118 dax_read_unlock(id); 119 120 put_dax(dax_dev); 121 122 if (len < 1) { 123 pr_debug("%s: error: dax access failed (%ld)\n", 124 bdevname(bdev, buf), len); 125 return false; 126 } 127 128 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { 129 /* 130 * An arch that has enabled the pmem api should also 131 * have its drivers support pfn_t_devmap() 132 * 133 * This is a developer warning and should not trigger in 134 * production. dax_flush() will crash since it depends 135 * on being able to do (page_address(pfn_to_page())). 
136 */ 137 WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); 138 dax_enabled = true; 139 } else if (pfn_t_devmap(pfn)) { 140 struct dev_pagemap *pgmap; 141 142 pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL); 143 if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX) 144 dax_enabled = true; 145 put_dev_pagemap(pgmap); 146 } 147 148 if (!dax_enabled) { 149 pr_debug("%s: error: dax support not enabled\n", 150 bdevname(bdev, buf)); 151 return false; 152 } 153 return true; 154 } 155 EXPORT_SYMBOL_GPL(__bdev_dax_supported); 156 #endif 157 158 enum dax_device_flags { 159 /* !alive + rcu grace period == no new operations / mappings */ 160 DAXDEV_ALIVE, 161 /* gate whether dax_flush() calls the low level flush routine */ 162 DAXDEV_WRITE_CACHE, 163 }; 164 165 /** 166 * struct dax_device - anchor object for dax services 167 * @inode: core vfs 168 * @cdev: optional character interface for "device dax" 169 * @host: optional name for lookups where the device path is not available 170 * @private: dax driver private data 171 * @flags: state and boolean properties 172 */ 173 struct dax_device { 174 struct hlist_node list; 175 struct inode inode; 176 struct cdev cdev; 177 const char *host; 178 void *private; 179 unsigned long flags; 180 const struct dax_operations *ops; 181 }; 182 183 static ssize_t write_cache_show(struct device *dev, 184 struct device_attribute *attr, char *buf) 185 { 186 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 187 ssize_t rc; 188 189 WARN_ON_ONCE(!dax_dev); 190 if (!dax_dev) 191 return -ENXIO; 192 193 rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev)); 194 put_dax(dax_dev); 195 return rc; 196 } 197 198 static ssize_t write_cache_store(struct device *dev, 199 struct device_attribute *attr, const char *buf, size_t len) 200 { 201 bool write_cache; 202 int rc = strtobool(buf, &write_cache); 203 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 204 205 WARN_ON_ONCE(!dax_dev); 206 if (!dax_dev) 207 return -ENXIO; 208 209 if (rc) 
210 len = rc; 211 else 212 dax_write_cache(dax_dev, write_cache); 213 214 put_dax(dax_dev); 215 return len; 216 } 217 static DEVICE_ATTR_RW(write_cache); 218 219 static umode_t dax_visible(struct kobject *kobj, struct attribute *a, int n) 220 { 221 struct device *dev = container_of(kobj, typeof(*dev), kobj); 222 struct dax_device *dax_dev = dax_get_by_host(dev_name(dev)); 223 224 WARN_ON_ONCE(!dax_dev); 225 if (!dax_dev) 226 return 0; 227 228 #ifndef CONFIG_ARCH_HAS_PMEM_API 229 if (a == &dev_attr_write_cache.attr) 230 return 0; 231 #endif 232 return a->mode; 233 } 234 235 static struct attribute *dax_attributes[] = { 236 &dev_attr_write_cache.attr, 237 NULL, 238 }; 239 240 struct attribute_group dax_attribute_group = { 241 .name = "dax", 242 .attrs = dax_attributes, 243 .is_visible = dax_visible, 244 }; 245 EXPORT_SYMBOL_GPL(dax_attribute_group); 246 247 /** 248 * dax_direct_access() - translate a device pgoff to an absolute pfn 249 * @dax_dev: a dax_device instance representing the logical memory range 250 * @pgoff: offset in pages from the start of the device to translate 251 * @nr_pages: number of consecutive pages caller can handle relative to @pfn 252 * @kaddr: output parameter that returns a virtual address mapping of pfn 253 * @pfn: output parameter that returns an absolute pfn translation of @pgoff 254 * 255 * Return: negative errno if an error occurs, otherwise the number of 256 * pages accessible at the device relative @pgoff. 
257 */ 258 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, 259 void **kaddr, pfn_t *pfn) 260 { 261 long avail; 262 263 if (!dax_dev) 264 return -EOPNOTSUPP; 265 266 if (!dax_alive(dax_dev)) 267 return -ENXIO; 268 269 if (nr_pages < 0) 270 return nr_pages; 271 272 avail = dax_dev->ops->direct_access(dax_dev, pgoff, nr_pages, 273 kaddr, pfn); 274 if (!avail) 275 return -ERANGE; 276 return min(avail, nr_pages); 277 } 278 EXPORT_SYMBOL_GPL(dax_direct_access); 279 280 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 281 size_t bytes, struct iov_iter *i) 282 { 283 if (!dax_alive(dax_dev)) 284 return 0; 285 286 return dax_dev->ops->copy_from_iter(dax_dev, pgoff, addr, bytes, i); 287 } 288 EXPORT_SYMBOL_GPL(dax_copy_from_iter); 289 290 size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, 291 size_t bytes, struct iov_iter *i) 292 { 293 if (!dax_alive(dax_dev)) 294 return 0; 295 296 return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i); 297 } 298 EXPORT_SYMBOL_GPL(dax_copy_to_iter); 299 300 #ifdef CONFIG_ARCH_HAS_PMEM_API 301 void arch_wb_cache_pmem(void *addr, size_t size); 302 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 303 { 304 if (unlikely(!dax_write_cache_enabled(dax_dev))) 305 return; 306 307 arch_wb_cache_pmem(addr, size); 308 } 309 #else 310 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) 311 { 312 } 313 #endif 314 EXPORT_SYMBOL_GPL(dax_flush); 315 316 void dax_write_cache(struct dax_device *dax_dev, bool wc) 317 { 318 if (wc) 319 set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 320 else 321 clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 322 } 323 EXPORT_SYMBOL_GPL(dax_write_cache); 324 325 bool dax_write_cache_enabled(struct dax_device *dax_dev) 326 { 327 return test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags); 328 } 329 EXPORT_SYMBOL_GPL(dax_write_cache_enabled); 330 331 bool dax_alive(struct dax_device *dax_dev) 332 { 333 
lockdep_assert_held(&dax_srcu); 334 return test_bit(DAXDEV_ALIVE, &dax_dev->flags); 335 } 336 EXPORT_SYMBOL_GPL(dax_alive); 337 338 static int dax_host_hash(const char *host) 339 { 340 return hashlen_hash(hashlen_string("DAX", host)) % DAX_HASH_SIZE; 341 } 342 343 /* 344 * Note, rcu is not protecting the liveness of dax_dev, rcu is ensuring 345 * that any fault handlers or operations that might have seen 346 * dax_alive(), have completed. Any operations that start after 347 * synchronize_srcu() has run will abort upon seeing !dax_alive(). 348 */ 349 void kill_dax(struct dax_device *dax_dev) 350 { 351 if (!dax_dev) 352 return; 353 354 clear_bit(DAXDEV_ALIVE, &dax_dev->flags); 355 356 synchronize_srcu(&dax_srcu); 357 358 spin_lock(&dax_host_lock); 359 hlist_del_init(&dax_dev->list); 360 spin_unlock(&dax_host_lock); 361 362 dax_dev->private = NULL; 363 } 364 EXPORT_SYMBOL_GPL(kill_dax); 365 366 static struct inode *dax_alloc_inode(struct super_block *sb) 367 { 368 struct dax_device *dax_dev; 369 struct inode *inode; 370 371 dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); 372 if (!dax_dev) 373 return NULL; 374 375 inode = &dax_dev->inode; 376 inode->i_rdev = 0; 377 return inode; 378 } 379 380 static struct dax_device *to_dax_dev(struct inode *inode) 381 { 382 return container_of(inode, struct dax_device, inode); 383 } 384 385 static void dax_i_callback(struct rcu_head *head) 386 { 387 struct inode *inode = container_of(head, struct inode, i_rcu); 388 struct dax_device *dax_dev = to_dax_dev(inode); 389 390 kfree(dax_dev->host); 391 dax_dev->host = NULL; 392 if (inode->i_rdev) 393 ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); 394 kmem_cache_free(dax_cache, dax_dev); 395 } 396 397 static void dax_destroy_inode(struct inode *inode) 398 { 399 struct dax_device *dax_dev = to_dax_dev(inode); 400 401 WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags), 402 "kill_dax() must be called before final iput()\n"); 403 call_rcu(&inode->i_rcu, dax_i_callback); 404 } 405 406 
static const struct super_operations dax_sops = { 407 .statfs = simple_statfs, 408 .alloc_inode = dax_alloc_inode, 409 .destroy_inode = dax_destroy_inode, 410 .drop_inode = generic_delete_inode, 411 }; 412 413 static struct dentry *dax_mount(struct file_system_type *fs_type, 414 int flags, const char *dev_name, void *data) 415 { 416 return mount_pseudo(fs_type, "dax:", &dax_sops, NULL, DAXFS_MAGIC); 417 } 418 419 static struct file_system_type dax_fs_type = { 420 .name = "dax", 421 .mount = dax_mount, 422 .kill_sb = kill_anon_super, 423 }; 424 425 static int dax_test(struct inode *inode, void *data) 426 { 427 dev_t devt = *(dev_t *) data; 428 429 return inode->i_rdev == devt; 430 } 431 432 static int dax_set(struct inode *inode, void *data) 433 { 434 dev_t devt = *(dev_t *) data; 435 436 inode->i_rdev = devt; 437 return 0; 438 } 439 440 static struct dax_device *dax_dev_get(dev_t devt) 441 { 442 struct dax_device *dax_dev; 443 struct inode *inode; 444 445 inode = iget5_locked(dax_superblock, hash_32(devt + DAXFS_MAGIC, 31), 446 dax_test, dax_set, &devt); 447 448 if (!inode) 449 return NULL; 450 451 dax_dev = to_dax_dev(inode); 452 if (inode->i_state & I_NEW) { 453 set_bit(DAXDEV_ALIVE, &dax_dev->flags); 454 inode->i_cdev = &dax_dev->cdev; 455 inode->i_mode = S_IFCHR; 456 inode->i_flags = S_DAX; 457 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 458 unlock_new_inode(inode); 459 } 460 461 return dax_dev; 462 } 463 464 static void dax_add_host(struct dax_device *dax_dev, const char *host) 465 { 466 int hash; 467 468 /* 469 * Unconditionally init dax_dev since it's coming from a 470 * non-zeroed slab cache 471 */ 472 INIT_HLIST_NODE(&dax_dev->list); 473 dax_dev->host = host; 474 if (!host) 475 return; 476 477 hash = dax_host_hash(host); 478 spin_lock(&dax_host_lock); 479 hlist_add_head(&dax_dev->list, &dax_host_list[hash]); 480 spin_unlock(&dax_host_lock); 481 } 482 483 struct dax_device *alloc_dax(void *private, const char *__host, 484 const struct dax_operations 
*ops) 485 { 486 struct dax_device *dax_dev; 487 const char *host; 488 dev_t devt; 489 int minor; 490 491 host = kstrdup(__host, GFP_KERNEL); 492 if (__host && !host) 493 return NULL; 494 495 minor = ida_simple_get(&dax_minor_ida, 0, MINORMASK+1, GFP_KERNEL); 496 if (minor < 0) 497 goto err_minor; 498 499 devt = MKDEV(MAJOR(dax_devt), minor); 500 dax_dev = dax_dev_get(devt); 501 if (!dax_dev) 502 goto err_dev; 503 504 dax_add_host(dax_dev, host); 505 dax_dev->ops = ops; 506 dax_dev->private = private; 507 return dax_dev; 508 509 err_dev: 510 ida_simple_remove(&dax_minor_ida, minor); 511 err_minor: 512 kfree(host); 513 return NULL; 514 } 515 EXPORT_SYMBOL_GPL(alloc_dax); 516 517 void put_dax(struct dax_device *dax_dev) 518 { 519 if (!dax_dev) 520 return; 521 iput(&dax_dev->inode); 522 } 523 EXPORT_SYMBOL_GPL(put_dax); 524 525 /** 526 * dax_get_by_host() - temporary lookup mechanism for filesystem-dax 527 * @host: alternate name for the device registered by a dax driver 528 */ 529 struct dax_device *dax_get_by_host(const char *host) 530 { 531 struct dax_device *dax_dev, *found = NULL; 532 int hash, id; 533 534 if (!host) 535 return NULL; 536 537 hash = dax_host_hash(host); 538 539 id = dax_read_lock(); 540 spin_lock(&dax_host_lock); 541 hlist_for_each_entry(dax_dev, &dax_host_list[hash], list) { 542 if (!dax_alive(dax_dev) 543 || strcmp(host, dax_dev->host) != 0) 544 continue; 545 546 if (igrab(&dax_dev->inode)) 547 found = dax_dev; 548 break; 549 } 550 spin_unlock(&dax_host_lock); 551 dax_read_unlock(id); 552 553 return found; 554 } 555 EXPORT_SYMBOL_GPL(dax_get_by_host); 556 557 /** 558 * inode_dax: convert a public inode into its dax_dev 559 * @inode: An inode with i_cdev pointing to a dax_dev 560 * 561 * Note this is not equivalent to to_dax_dev() which is for private 562 * internal use where we know the inode filesystem type == dax_fs_type. 
563 */ 564 struct dax_device *inode_dax(struct inode *inode) 565 { 566 struct cdev *cdev = inode->i_cdev; 567 568 return container_of(cdev, struct dax_device, cdev); 569 } 570 EXPORT_SYMBOL_GPL(inode_dax); 571 572 struct inode *dax_inode(struct dax_device *dax_dev) 573 { 574 return &dax_dev->inode; 575 } 576 EXPORT_SYMBOL_GPL(dax_inode); 577 578 void *dax_get_private(struct dax_device *dax_dev) 579 { 580 return dax_dev->private; 581 } 582 EXPORT_SYMBOL_GPL(dax_get_private); 583 584 static void init_once(void *_dax_dev) 585 { 586 struct dax_device *dax_dev = _dax_dev; 587 struct inode *inode = &dax_dev->inode; 588 589 memset(dax_dev, 0, sizeof(*dax_dev)); 590 inode_init_once(inode); 591 } 592 593 static int __dax_fs_init(void) 594 { 595 int rc; 596 597 dax_cache = kmem_cache_create("dax_cache", sizeof(struct dax_device), 0, 598 (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 599 SLAB_MEM_SPREAD|SLAB_ACCOUNT), 600 init_once); 601 if (!dax_cache) 602 return -ENOMEM; 603 604 rc = register_filesystem(&dax_fs_type); 605 if (rc) 606 goto err_register_fs; 607 608 dax_mnt = kern_mount(&dax_fs_type); 609 if (IS_ERR(dax_mnt)) { 610 rc = PTR_ERR(dax_mnt); 611 goto err_mount; 612 } 613 dax_superblock = dax_mnt->mnt_sb; 614 615 return 0; 616 617 err_mount: 618 unregister_filesystem(&dax_fs_type); 619 err_register_fs: 620 kmem_cache_destroy(dax_cache); 621 622 return rc; 623 } 624 625 static void __dax_fs_exit(void) 626 { 627 kern_unmount(dax_mnt); 628 unregister_filesystem(&dax_fs_type); 629 kmem_cache_destroy(dax_cache); 630 } 631 632 static int __init dax_fs_init(void) 633 { 634 int rc; 635 636 rc = __dax_fs_init(); 637 if (rc) 638 return rc; 639 640 rc = alloc_chrdev_region(&dax_devt, 0, MINORMASK+1, "dax"); 641 if (rc) 642 __dax_fs_exit(); 643 return rc; 644 } 645 646 static void __exit dax_fs_exit(void) 647 { 648 unregister_chrdev_region(dax_devt, MINORMASK+1); 649 ida_destroy(&dax_minor_ida); 650 __dax_fs_exit(); 651 } 652 653 MODULE_AUTHOR("Intel Corporation"); 654 
/* Module license tag */
MODULE_LICENSE("GPL v2");

/* Hook dax_fs_init() up via subsys_initcall() and dax_fs_exit() as exit */
subsys_initcall(dax_fs_init);
module_exit(dax_fs_exit);