/*
 * Copyright(c) 2016 - 2017 Intel Corporation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "dax-private.h"
#include "dax.h"

static struct class *dax_class;

/*
 * Rely on the fact that drvdata is set before the attributes are
 * registered, and that the attributes are unregistered before drvdata
 * is cleared to assume that drvdata is always valid.
 */
static ssize_t id_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%d\n", dax_region->id);
}
static DEVICE_ATTR_RO(id);

static ssize_t region_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%llu\n", (unsigned long long)
			resource_size(&dax_region->res));
}
static struct device_attribute dev_attr_region_size = __ATTR(size, 0444,
		region_size_show, NULL);

static ssize_t align_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dax_region *dax_region = dev_get_drvdata(dev);

	return sprintf(buf, "%u\n", dax_region->align);
}
static DEVICE_ATTR_RO(align);

static struct attribute *dax_region_attributes[] = {
	&dev_attr_region_size.attr,
	&dev_attr_align.attr,
	&dev_attr_id.attr,
	NULL,
};

static const struct attribute_group dax_region_attribute_group = {
	.name = "dax_region",
	.attrs = dax_region_attributes,
};

static const struct attribute_group *dax_region_attribute_groups[] = {
	&dax_region_attribute_group,
	NULL,
};

static void dax_region_free(struct kref *kref)
{
	struct dax_region *dax_region;

	dax_region = container_of(kref, struct dax_region, kref);
	kfree(dax_region);
}

void dax_region_put(struct dax_region *dax_region)
{
	kref_put(&dax_region->kref, dax_region_free);
}
EXPORT_SYMBOL_GPL(dax_region_put);

static void dax_region_unregister(void *region)
{
	struct dax_region *dax_region = region;

	sysfs_remove_groups(&dax_region->dev->kobj,
			dax_region_attribute_groups);
	dax_region_put(dax_region);
}

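/*
 * Region lifetime note (summary of the code below): the kref initialized
 * in alloc_dax_region() is owned by the caller, an additional reference
 * is taken on behalf of the devm-managed dax_region_unregister() action,
 * and each dev_dax created against the region takes its own reference
 * (dropped in dev_dax_release()). The region is therefore freed only
 * after the parent device is unbound and all child dev_dax instances
 * are gone.
 */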
struct dax_region *alloc_dax_region(struct device *parent, int region_id,
		struct resource *res, unsigned int align, void *addr,
		unsigned long pfn_flags)
{
	struct dax_region *dax_region;

	/*
	 * The DAX core assumes that it can store its private data in
	 * parent->driver_data. This WARN is a reminder / safeguard for
	 * developers of device-dax drivers.
	 */
	if (dev_get_drvdata(parent)) {
		dev_WARN(parent, "dax core failed to setup private data\n");
		return NULL;
	}

	if (!IS_ALIGNED(res->start, align)
			|| !IS_ALIGNED(resource_size(res), align))
		return NULL;

	dax_region = kzalloc(sizeof(*dax_region), GFP_KERNEL);
	if (!dax_region)
		return NULL;

	dev_set_drvdata(parent, dax_region);
	memcpy(&dax_region->res, res, sizeof(*res));
	dax_region->pfn_flags = pfn_flags;
	kref_init(&dax_region->kref);
	dax_region->id = region_id;
	ida_init(&dax_region->ida);
	dax_region->align = align;
	dax_region->dev = parent;
	dax_region->base = addr;
	if (sysfs_create_groups(&parent->kobj, dax_region_attribute_groups)) {
		kfree(dax_region);
		return NULL;
	}

	kref_get(&dax_region->kref);
	if (devm_add_action_or_reset(parent, dax_region_unregister, dax_region))
		return NULL;
	return dax_region;
}
EXPORT_SYMBOL_GPL(alloc_dax_region);

static struct dev_dax *to_dev_dax(struct device *dev)
{
	return container_of(dev, struct dev_dax, dev);
}

static ssize_t size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	unsigned long long size = 0;
	int i;

	for (i = 0; i < dev_dax->num_resources; i++)
		size += resource_size(&dev_dax->res[i]);

	return sprintf(buf, "%llu\n", size);
}
static DEVICE_ATTR_RO(size);

static struct attribute *dev_dax_attributes[] = {
	&dev_attr_size.attr,
	NULL,
};

static const struct attribute_group dev_dax_attribute_group = {
	.attrs = dev_dax_attributes,
};

static const struct attribute_group *dax_attribute_groups[] = {
	&dev_dax_attribute_group,
	NULL,
};

static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
		const char *func)
{
	struct dax_region *dax_region = dev_dax->region;
	struct device *dev = &dev_dax->dev;
	unsigned long mask;

	if (!dax_alive(dev_dax->dax_dev))
		return -ENXIO;

	/* prevent private mappings from being established */
	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
				current->comm, func);
		return -EINVAL;
	}

	mask = dax_region->align - 1;
	if (vma->vm_start & mask || vma->vm_end & mask) {
		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
				current->comm, func, vma->vm_start, vma->vm_end,
				mask);
		return -EINVAL;
	}

	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
				current->comm, func);
		return -EINVAL;
	}

	if (!vma_is_dax(vma)) {
		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
				current->comm, func);
		return -EINVAL;
	}

	return 0;
}

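/*
 * dax_pgoff_to_phys() walks the device's resource array translating a
 * linear page offset into a physical address. Illustrative example
 * (hypothetical ranges): with res[0] spanning 0x100000000-0x1ffffffff
 * and res[1] spanning 0x300000000-0x3ffffffff, a pgoff past the end of
 * res[0] first fails the res[0] range check, is decremented by
 * PHYS_PFN(resource_size(&res[0])), and is then resolved relative to
 * res[1].start on the next iteration.
 */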
/* see "strong" declaration in tools/testing/nvdimm/dax-dev.c */
__weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
		unsigned long size)
{
	struct resource *res;
	/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
	phys_addr_t uninitialized_var(phys);
	int i;

	for (i = 0; i < dev_dax->num_resources; i++) {
		res = &dev_dax->res[i];
		phys = pgoff * PAGE_SIZE + res->start;
		if (phys >= res->start && phys <= res->end)
			break;
		pgoff -= PHYS_PFN(resource_size(res));
	}

	if (i < dev_dax->num_resources) {
		res = &dev_dax->res[i];
		if (phys + size - 1 <= res->end)
			return phys;
	}

	return -1;
}

static int __dev_dax_pte_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	int rc = VM_FAULT_SIGBUS;
	phys_addr_t phys;
	pfn_t pfn;
	unsigned int fault_size = PAGE_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PAGE_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size != dax_region->align)
		return VM_FAULT_SIGBUS;

	phys = dax_pgoff_to_phys(dev_dax, vmf->pgoff, PAGE_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				vmf->pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);

	if (rc == -ENOMEM)
		return VM_FAULT_OOM;
	if (rc < 0 && rc != -EBUSY)
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

static int __dev_dax_pmd_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PMD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PMD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pmd mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pmd_addr < vmf->vma->vm_start ||
			(pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pmd_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PMD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	unsigned long pud_addr = vmf->address & PUD_MASK;
	struct device *dev = &dev_dax->dev;
	struct dax_region *dax_region;
	phys_addr_t phys;
	pgoff_t pgoff;
	pfn_t pfn;
	unsigned int fault_size = PUD_SIZE;

	if (check_vma(dev_dax, vmf->vma, __func__))
		return VM_FAULT_SIGBUS;

	dax_region = dev_dax->region;
	if (dax_region->align > PUD_SIZE) {
		dev_dbg(dev, "%s: alignment (%#x) > fault size (%#x)\n",
				__func__, dax_region->align, fault_size);
		return VM_FAULT_SIGBUS;
	}

	/* dax pud mappings require pfn_t_devmap() */
	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
		dev_dbg(dev, "%s: region lacks devmap flags\n", __func__);
		return VM_FAULT_SIGBUS;
	}

	if (fault_size < dax_region->align)
		return VM_FAULT_SIGBUS;
	else if (fault_size > dax_region->align)
		return VM_FAULT_FALLBACK;

	/* if we are outside of the VMA */
	if (pud_addr < vmf->vma->vm_start ||
			(pud_addr + PUD_SIZE) > vmf->vma->vm_end)
		return VM_FAULT_SIGBUS;

	pgoff = linear_page_index(vmf->vma, pud_addr);
	phys = dax_pgoff_to_phys(dev_dax, pgoff, PUD_SIZE);
	if (phys == -1) {
		dev_dbg(dev, "%s: pgoff_to_phys(%#lx) failed\n", __func__,
				pgoff);
		return VM_FAULT_SIGBUS;
	}

	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);

	return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, pfn,
			vmf->flags & FAULT_FLAG_WRITE);
}
#else
static int __dev_dax_pud_fault(struct dev_dax *dev_dax, struct vm_fault *vmf)
{
	return VM_FAULT_FALLBACK;
}
#endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

static int dev_dax_huge_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size)
{
	int rc, id;
	struct file *filp = vmf->vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s: %s: %s (%#lx - %#lx) size = %d\n", __func__,
			current->comm,
			(vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read",
			vmf->vma->vm_start, vmf->vma->vm_end, pe_size);

	id = dax_read_lock();
	switch (pe_size) {
	case PE_SIZE_PTE:
		rc = __dev_dax_pte_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PMD:
		rc = __dev_dax_pmd_fault(dev_dax, vmf);
		break;
	case PE_SIZE_PUD:
		rc = __dev_dax_pud_fault(dev_dax, vmf);
		break;
	default:
		rc = VM_FAULT_SIGBUS;
	}
	dax_read_unlock(id);

	return rc;
}

static int dev_dax_fault(struct vm_fault *vmf)
{
	return dev_dax_huge_fault(vmf, PE_SIZE_PTE);
}

static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
{
	struct file *filp = vma->vm_file;
	struct dev_dax *dev_dax = filp->private_data;
	struct dax_region *dax_region = dev_dax->region;

	if (!IS_ALIGNED(addr, dax_region->align))
		return -EINVAL;
	return 0;
}

static const struct vm_operations_struct dax_vm_ops = {
	.fault = dev_dax_fault,
	.huge_fault = dev_dax_huge_fault,
	.split = dev_dax_split,
};

static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct dev_dax *dev_dax = filp->private_data;
	int rc, id;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);

	/*
	 * We lock to check dax_dev liveness and will re-check at
	 * fault time.
	 */
	id = dax_read_lock();
	rc = check_vma(dev_dax, vma, __func__);
	dax_read_unlock(id);
	if (rc)
		return rc;

	vma->vm_ops = &dax_vm_ops;
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}

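/*
 * Note on the helper below: to guarantee that a mapping can honor the
 * region alignment, the search is performed with an inflated length
 * (len + align) and the returned address is then advanced by
 * ((off - addr) & (align - 1)), i.e. just enough that the virtual
 * address and the file offset share the same alignment. For example
 * (hypothetical values), with align = 2MB and off = 0, an address that
 * is not 2MB-aligned is rounded up to the next 2MB boundary so that
 * PMD faults remain possible.
 */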
/* return an unmapped area aligned to the dax region specified alignment */
static unsigned long dax_get_unmapped_area(struct file *filp,
		unsigned long addr, unsigned long len, unsigned long pgoff,
		unsigned long flags)
{
	unsigned long off, off_end, off_align, len_align, addr_align, align;
	struct dev_dax *dev_dax = filp ? filp->private_data : NULL;
	struct dax_region *dax_region;

	if (!dev_dax || addr)
		goto out;

	dax_region = dev_dax->region;
	align = dax_region->align;
	off = pgoff << PAGE_SHIFT;
	off_end = off + len;
	off_align = round_up(off, align);

	if ((off_end <= off_align) || ((off_end - off_align) < align))
		goto out;

	len_align = len + align;
	if ((off + len_align) < off)
		goto out;

	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
			pgoff, flags);
	if (!IS_ERR_VALUE(addr_align)) {
		addr_align += (off - addr_align) & (align - 1);
		return addr_align;
	}
out:
	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}

static int dax_open(struct inode *inode, struct file *filp)
{
	struct dax_device *dax_dev = inode_dax(inode);
	struct inode *__dax_inode = dax_inode(dax_dev);
	struct dev_dax *dev_dax = dax_get_private(dax_dev);

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	inode->i_mapping = __dax_inode->i_mapping;
	inode->i_mapping->host = __dax_inode;
	filp->f_mapping = inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	filp->private_data = dev_dax;
	inode->i_flags = S_DAX;

	return 0;
}

static int dax_release(struct inode *inode, struct file *filp)
{
	struct dev_dax *dev_dax = filp->private_data;

	dev_dbg(&dev_dax->dev, "%s\n", __func__);
	return 0;
}

static const struct file_operations dax_fops = {
	.llseek = noop_llseek,
	.owner = THIS_MODULE,
	.open = dax_open,
	.release = dax_release,
	.get_unmapped_area = dax_get_unmapped_area,
	.mmap = dax_mmap,
};

static void dev_dax_release(struct device *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_region *dax_region = dev_dax->region;
	struct dax_device *dax_dev = dev_dax->dax_dev;

	if (dev_dax->id >= 0)
		ida_simple_remove(&dax_region->ida, dev_dax->id);
	dax_region_put(dax_region);
	put_dax(dax_dev);
	kfree(dev_dax);
}

static void kill_dev_dax(struct dev_dax *dev_dax)
{
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);

	kill_dax(dax_dev);
	unmap_mapping_range(inode->i_mapping, 0, 0, 1);
}

static void unregister_dev_dax(void *dev)
{
	struct dev_dax *dev_dax = to_dev_dax(dev);
	struct dax_device *dax_dev = dev_dax->dax_dev;
	struct inode *inode = dax_inode(dax_dev);
	struct cdev *cdev = inode->i_cdev;

	dev_dbg(dev, "%s\n", __func__);

	kill_dev_dax(dev_dax);
	cdev_device_del(cdev, dev);
	put_device(dev);
}

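/*
 * Illustrative usage (not part of this file): a region driver such as
 * dax_pmem typically pairs alloc_dax_region() with devm_create_dev_dax()
 * in its probe path and drops its own region reference once the device
 * has been created, e.g.:
 *
 *	dax_region = alloc_dax_region(dev, region_id, &res, align, addr,
 *			PFN_DEV | PFN_MAP);
 *	if (!dax_region)
 *		return -ENOMEM;
 *	dev_dax = devm_create_dev_dax(dax_region, id, &res, 1);
 *	dax_region_put(dax_region);
 *	if (IS_ERR(dev_dax))
 *		return PTR_ERR(dev_dax);
 */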
struct dev_dax *devm_create_dev_dax(struct dax_region *dax_region,
		int id, struct resource *res, int count)
{
	struct device *parent = dax_region->dev;
	struct dax_device *dax_dev;
	struct dev_dax *dev_dax;
	struct inode *inode;
	struct device *dev;
	struct cdev *cdev;
	int rc, i;

	if (!count)
		return ERR_PTR(-EINVAL);

	dev_dax = kzalloc(sizeof(*dev_dax) + sizeof(*res) * count, GFP_KERNEL);
	if (!dev_dax)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < count; i++) {
		if (!IS_ALIGNED(res[i].start, dax_region->align)
				|| !IS_ALIGNED(resource_size(&res[i]),
					dax_region->align)) {
			rc = -EINVAL;
			break;
		}
		dev_dax->res[i].start = res[i].start;
		dev_dax->res[i].end = res[i].end;
	}

	if (i < count)
		goto err_id;

	if (id < 0) {
		id = ida_simple_get(&dax_region->ida, 0, 0, GFP_KERNEL);
		dev_dax->id = id;
		if (id < 0) {
			rc = id;
			goto err_id;
		}
	} else {
		/* region provider owns @id lifetime */
		dev_dax->id = -1;
	}

	/*
	 * No 'host' or dax_operations since there is no access to this
	 * device outside of mmap of the resulting character device.
	 */
	dax_dev = alloc_dax(dev_dax, NULL, NULL);
	if (!dax_dev) {
		rc = -ENOMEM;
		goto err_dax;
	}

	/* from here on we're committed to teardown via dev_dax_release() */
	dev = &dev_dax->dev;
	device_initialize(dev);

	inode = dax_inode(dax_dev);
	cdev = inode->i_cdev;
	cdev_init(cdev, &dax_fops);
	cdev->owner = parent->driver->owner;

	dev_dax->num_resources = count;
	dev_dax->dax_dev = dax_dev;
	dev_dax->region = dax_region;
	kref_get(&dax_region->kref);

	dev->devt = inode->i_rdev;
	dev->class = dax_class;
	dev->parent = parent;
	dev->groups = dax_attribute_groups;
	dev->release = dev_dax_release;
	dev_set_name(dev, "dax%d.%d", dax_region->id, id);

	rc = cdev_device_add(cdev, dev);
	if (rc) {
		kill_dev_dax(dev_dax);
		put_device(dev);
		return ERR_PTR(rc);
	}

	rc = devm_add_action_or_reset(dax_region->dev, unregister_dev_dax, dev);
	if (rc)
		return ERR_PTR(rc);

	return dev_dax;

 err_dax:
	if (dev_dax->id >= 0)
		ida_simple_remove(&dax_region->ida, dev_dax->id);
 err_id:
	kfree(dev_dax);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(devm_create_dev_dax);

static int __init dax_init(void)
{
	dax_class = class_create(THIS_MODULE, "dax");
	return PTR_ERR_OR_ZERO(dax_class);
}

static void __exit dax_exit(void)
{
	class_destroy(dax_class);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
subsys_initcall(dax_init);
module_exit(dax_exit);