/*
 * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax-private.h"
#include "dax.h"

static struct class *dax_class;

/*
 * Rely on the fact that drvdata is set before the attributes are
 * registered, and that the attributes are unregistered before drvdata
 * is cleared to assume that drvdata is always valid.
 */
static ssize_t id_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%d\n", dax_region->id);
}
static DEVICE_ATTR_RO(id);

static ssize_t region_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%llu\n", (unsigned long long)
			resource_size(&dax_region->res));
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
		region_size_show, NULL);

static ssize_t align_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%u\n", dax_region->align);
}
static DEVICE_ATTR_RO(align);

static struct attribute *dax_region_attributes[] = {
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
	NULL,
};

static const struct attribute_group dax_region_attribute_group = {
	.name = "dax_region",
	.attrs = dax_region_attributes,
};

static const struct attribute_group *dax_region_attribute_groups[] = {
	&dax_region_attribute_group,
	NULL,
};

static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);

static void dax_region_unregister(void *region)
{
	struct dax_region *dax_region = region;

	sysfs_remove_groups(&dax_region->dev->kobj,
			dax_region_attribute_groups);
	dax_region_put(dax_region);
}
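/*
 * Region lifetime, as implemented below: the kref initialized in
 * alloc_dax_region() belongs to the caller, while the extra kref_get()
 * taken before devm_add_action_or_reset() is owned by the devm-managed
 * dax_region_unregister() action.  Each dev_dax instance created
 * against the region takes a further reference (see
 * devm_create_dev_dax()) that is dropped in dev_dax_release().
 */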
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	/*
	 * The DAX core assumes that it can store its private data in
	 * parent->driver_data. This WARN is a reminder / safeguard for
	 * developers of device-dax drivers.
	 */
	if (dev_get_drvdata(parent)) {
		dev_WARN(parent, "dax core failed to setup private data\n");
		return NULL;
	}

	if (!IS_ALIGNED(res->start, align)
			|| !IS_ALIGNED(resource_size(res), align))
		return NULL;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
	if (!dax_region)
		return NULL;

	dev_set_drvdata(parent, dax_region);
	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
	}

	kref_get(&dax_region->kref);
	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
		return NULL;
	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);

static struct dev_dax *to_dev_dax(struct device *dev)
{
	return container_of(dev, struct dev_dax, dev);
}

static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dev_dax->num_resources; i++)
		size += resource_size(&dev_dax->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

static struct attribute *dev_dax_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dev_dax_attribute_group = {
	.attrs = dev_dax_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dev_dax_attribute_group,
	NULL,
};

static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dev_dax->region;
	struct device *dev = &dev_dax->dev;
	unsigned long mask;

	if (!dax_alive(dev_dax->dax_dev))
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}
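/*
 * dax_pgoff_to_phys() translates a fault pgoff into a physical address
 * by walking the dev_dax resource array.  Illustrative example
 * (addresses made up): with res[0] spanning 0x100000000-0x13fffffff and
 * res[1] spanning 0x200000000-0x23fffffff, a pgoff one page past the
 * end of res[0] resolves to res[1]->start, because every iteration that
 * misses subtracts that resource's size in pages from pgoff before
 * trying the next resource.
 */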
/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	phys_addr_t phys;
	int i;

	for (i = 0; i < dev_dax->num_resources; i++) {
		res = &dev_dax->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dev_dax->num_resources) {
		res = &dev_dax->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}
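/*
 * Fault handling policy implemented by the handlers below: a fault of a
 * given size (PTE/PMD/PUD) only succeeds when it exactly matches
 * dax_region->align.  Faults smaller than the region alignment return
 * VM_FAULT_SIGBUS, while PMD/PUD faults larger than the alignment
 * return VM_FAULT_FALLBACK so the core mm retries with a smaller
 * mapping size.
 */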
static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vmf->vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pmd_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pud_addr = vmf->address & PUD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PUD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PUD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pud mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pud_addr < vmf->vma->vm_start ||
			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pud_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#else
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static int dev_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int rc, id;
	struct file *filp = vmf->vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
			? "write" : "read",
			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);

	id = dax_read_lock();
	switch (pe_size) {
	case PE_SIZE_PTE:
		rc = __dev_dax_pte_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PMD:
		rc = __dev_dax_pmd_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PUD:
		rc = __dev_dax_pud_fault(dev_dax, vmf);
		break;
	default:
		rc = VM_FAULT_SIGBUS;
	}
	dax_read_unlock(id);

	return rc;
}

static int dev_dax_fault(struct vm_fault *vmf)
{
	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static const struct vm_operations_struct dax_vm_ops = {
	.fault = dev_dax_fault,
	.huge_fault = dev_dax_huge_fault,
};

static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dev_dax *dev_dax = filp->private_data;
	int rc, id;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);

	/*
	 * We lock to check dax_dev liveness and will re-check at
	 * fault time.
	 */
	id = dax_read_lock();
	rc = check_vma(dev_dax, vma, __func__);
	dax_read_unlock(id);
	if (rc)
		return rc;

	vma->vm_ops = &dax_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}

/* return an unmapped area aligned to the dax region specified alignment */
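/*
 * Illustrative example of the strategy below (sizes made up): for a 2M
 * aligned region, a 4M mmap request is searched as a 6M window
 * (len + align), and the address found is then bumped by
 * (off - addr_align) & (align - 1) so that the mapping address and the
 * file offset share the same 2M phase, keeping huge-page faults
 * possible.
 */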
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dev_dax || addr)
		goto out;

	dax_region = dev_dax->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}

static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct inode *__dax_inode = dax_inode(dax_dev);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	inode->i_mapping = __dax_inode->i_mapping;
	inode->i_mapping->host = __dax_inode;
	filp->f_mapping = inode->i_mapping;
	filp->private_data = dev_dax;
	inode->i_flags = S_DAX;

	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};

static void dev_dax_release(struct device *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_region *dax_region = dev_dax->region;
	struct dax_device *dax_dev = dev_dax->dax_dev;

	ida_simple_remove(&dax_region->ida, dev_dax->id);
	dax_region_put(dax_region);
	put_dax(dax_dev);
	kfree(dev_dax);
}

static void kill_dev_dax(struct dev_dax *dev_dax)
{
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);

	kill_dax(dax_dev);
	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
}

static void unregister_dev_dax(void *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);
	struct cdev *cdev = inode->i_cdev;

	dev_dbg(dev, "%s\n", __func__);

	kill_dev_dax(dev_dax);
	cdev_device_del(cdev, dev);
	put_device(dev);
}
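/*
 * devm_create_dev_dax() below allocates the dev_dax, claims an id from
 * the region, and publishes the character device.  Once
 * cdev_device_add() succeeds, teardown is driven by the devm-registered
 * unregister_dev_dax() action, which kills the dax_device, deletes the
 * cdev, and drops the final device reference so dev_dax_release() can
 * free everything.
 */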
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
		struct resource *res, int count)
{
	struct device *parent = dax_region->dev;
	struct dax_device *dax_dev;
	struct dev_dax *dev_dax;
	struct inode *inode;
	struct device *dev;
	struct cdev *cdev;
	int rc = 0, i;

	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
	if (!dev_dax)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < count; i++) {
		if (!IS_ALIGNED(res[i].start, dax_region->align)
				|| !IS_ALIGNED(resource_size(&res[i]),
					dax_region->align)) {
			rc = -EINVAL;
			break;
		}
		dev_dax->res[i].start = res[i].start;
		dev_dax->res[i].end = res[i].end;
	}

	if (i < count)
		goto err_id;

	dev_dax->id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
	if (dev_dax->id < 0) {
		rc = dev_dax->id;
		goto err_id;
	}

	/*
	 * No 'host' or dax_operations since there is no access to this
	 * device outside of mmap of the resulting character device.
	 */
	dax_dev = alloc_dax(dev_dax, NULL, NULL);
	if (!dax_dev) {
		rc = -ENOMEM;
		goto err_dax;
	}

	/* from here on we're committed to teardown via dev_dax_release() */
	dev = &dev_dax->dev;
	device_initialize(dev);

	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &dax_fops);
	cdev->owner = parent->driver->owner;

	dev_dax->num_resources = count;
	dev_dax->dax_dev = dax_dev;
	dev_dax->region = dax_region;
	kref_get(&dax_region->kref);

	dev->devt = inode->i_rdev;
	dev->class = dax_class;
	dev->parent = parent;
	dev->groups = dax_attribute_groups;
	dev->release = dev_dax_release;
	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);

	rc = cdev_device_add(cdev, dev);
	if (rc) {
		kill_dev_dax(dev_dax);
		put_device(dev);
		return ERR_PTR(rc);
	}

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
	if (rc)
		return ERR_PTR(rc);

	return dev_dax;

err_dax:
	ida_simple_remove(&dax_region->ida, dev_dax->id);
err_id:
	kfree(dev_dax);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(devm_create_dev_dax);

static int __init dax_init(void)
{
	dax_class = class_create(THIS_MODULE, "dax");
	return PTR_ERR_OR_ZERO(dax_class);
}

static void __exit dax_exit(void)
{
	class_destroy(dax_class);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);