// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The iopt_pages is the center of the storage and motion of PFNs. Each
 * iopt_pages represents a logical linear array of full PFNs. The array is 0
 * based and has npages in it. Accessors use 'index' to refer to the entry in
 * this logical array, regardless of its storage location.
 *
 * PFNs are stored in a tiered scheme:
 *  1) iopt_pages::pinned_pfns xarray
 *  2) An iommu_domain
 *  3) The origin of the PFNs, i.e. the userspace pointer
 *
 * PFNs have to be copied between all combinations of tiers, depending on the
 * configuration.
 *
 * When a PFN is taken out of the userspace pointer it is pinned exactly once.
 * The storage locations of the PFN's index are tracked in the two interval
 * trees. If no interval includes the index then it is not pinned.
 *
 * If access_itree includes the PFN's index then an in-kernel access has
 * requested the page. The PFN is stored in the xarray so other requestors can
 * continue to find it.
 *
 * If the domains_itree includes the PFN's index then an iommu_domain is
 * storing the PFN and it can be read back using iommu_iova_to_phys(). To
 * avoid duplicating storage the xarray is not used if only iommu_domains are
 * using the PFN's index.
 *
 * As a general principle this is designed so that destroy never fails. This
 * means removing an iommu_domain or releasing an in-kernel access will not
 * fail due to insufficient memory. In practice this means some cases have to
 * hold PFNs in the xarray even though they are also being stored in an
 * iommu_domain.
 *
 * While the iopt_pages can use an iommu_domain as storage, it does not have an
 * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the
 * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages
 * and reference their own slice of the PFN array, with sub page granularity.
 *
 * In this file the term 'last' indicates an inclusive and closed interval, eg
 * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to
 * no PFNs.
 *
 * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so
 * last_iova + 1 can overflow. An iopt_pages index will always be much less
 * than ULONG_MAX so last_index + 1 cannot overflow.
 */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "double_span.h"

#ifndef CONFIG_IOMMUFD_TEST
#define TEMP_MEMORY_LIMIT 65536
#else
#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit
#endif
#define BATCH_BACKUP_SIZE 32

/*
 * More memory makes pin_user_pages() and the batching more efficient, but as
 * this is only a performance optimization don't try too hard to get it. A 64k
 * allocation can hold about 26M of 4k pages and 13G of 2M pages in a
 * pfn_batch. Various destroy paths cannot fail and provide a small amount of
 * stack memory as a backup contingency. If backup_len is given this cannot
 * fail.
 */
static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len)
{
	void *res;

	if (WARN_ON(*size == 0))
		return NULL;

	if (*size < backup_len)
		return backup;

	if (!backup && iommufd_should_fail())
		return NULL;

	*size = min_t(size_t, *size, TEMP_MEMORY_LIMIT);
	res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (res)
		return res;
	*size = PAGE_SIZE;
	if (backup_len) {
		res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
		if (res)
			return res;
		*size = backup_len;
		return backup;
	}
	return kmalloc(*size, GFP_KERNEL);
}

void interval_tree_double_span_iter_update(
	struct interval_tree_double_span_iter *iter)
{
	unsigned long last_hole = ULONG_MAX;
	unsigned int i;

	for (i = 0; i != ARRAY_SIZE(iter->spans); i++) {
		if (interval_tree_span_iter_done(&iter->spans[i])) {
			iter->is_used = -1;
			return;
		}

		if (iter->spans[i].is_hole) {
			last_hole = min(last_hole, iter->spans[i].last_hole);
			continue;
		}

		iter->is_used = i + 1;
		iter->start_used = iter->spans[i].start_used;
		iter->last_used = min(iter->spans[i].last_used, last_hole);
		return;
	}

	iter->is_used = 0;
	iter->start_hole = iter->spans[0].start_hole;
	iter->last_hole =
		min(iter->spans[0].last_hole, iter->spans[1].last_hole);
}

void interval_tree_double_span_iter_first(
	struct interval_tree_double_span_iter *iter,
	struct rb_root_cached *itree1, struct rb_root_cached *itree2,
	unsigned long first_index, unsigned long last_index)
{
	unsigned int i;

	iter->itrees[0] = itree1;
	iter->itrees[1] = itree2;
	for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
		interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i],
					      first_index, last_index);
	interval_tree_double_span_iter_update(iter);
}

void interval_tree_double_span_iter_next(
	struct interval_tree_double_span_iter *iter)
{
	unsigned int i;

	if (iter->is_used == -1 ||
	    iter->last_hole == iter->spans[0].last_index) {
		iter->is_used = -1;
		return;
	}

	for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
		interval_tree_span_iter_advance(
			&iter->spans[i], iter->itrees[i], iter->last_hole + 1);
	interval_tree_double_span_iter_update(iter);
}
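/*
 * Usage sketch (illustrative; pfn_reader_unpin() later in this file is a real
 * caller): the iterator is normally driven through the
 * interval_tree_for_each_double_span() helper from double_span.h. Each
 * sub-span of [start, last] is classified by which tree covers it:
 * is_used == 1 for the first tree, 2 for the second, and 0 for a hole covered
 * by neither, with the bounds in start_used/last_used or start_hole/last_hole:
 *
 *	struct interval_tree_double_span_iter span;
 *
 *	interval_tree_for_each_double_span(&span, &pages->access_itree,
 *					   &pages->domains_itree, start, last) {
 *		if (span.is_used)
 *			continue;
 *		// [span.start_hole, span.last_hole] is in neither tree,
 *		// ie these indexes are not pinned anywhere
 *	}
 */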
static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages)
{
	pages->npinned += npages;
}

static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages)
{
	pages->npinned -= npages;
}

static void iopt_pages_err_unpin(struct iopt_pages *pages,
				 unsigned long start_index,
				 unsigned long last_index,
				 struct page **page_list)
{
	unsigned long npages = last_index - start_index + 1;

	unpin_user_pages(page_list, npages);
	iopt_pages_sub_npinned(pages, npages);
}
/*
 * index is the number of PAGE_SIZE units from the start of the area's
 * iopt_pages. If the iova is sub page-size then the area has an iova that
 * covers a portion of the first and last pages in the range.
 */
static unsigned long iopt_area_index_to_iova(struct iopt_area *area,
					     unsigned long index)
{
	index -= iopt_area_index(area);
	if (index == 0)
		return iopt_area_iova(area);
	return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE;
}

static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area,
						  unsigned long index)
{
	if (index == iopt_area_last_index(area))
		return iopt_area_last_iova(area);
	return iopt_area_iova(area) - area->page_offset +
	       (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1;
}
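/*
 * Worked example (illustrative values, assuming 4k pages): an area with
 * iopt_area_iova() == 0x10800, page_offset == 0x800 and iopt_area_index() == 3
 * begins 0x800 bytes into page index 3, so:
 *
 *	iopt_area_index_to_iova(area, 3) == 0x10800
 *	iopt_area_index_to_iova(area, 4) == 0x10800 - 0x800 + 0x1000 == 0x11000
 *	iopt_area_index_to_iova_last(area, 3) == 0x10fff (unless 3 is the last
 *	index, in which case iopt_area_last_iova() is returned)
 *
 * ie only the first and last pages of an area can be partially covered; every
 * other index maps to a page-aligned IOVA.
 */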
static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova,
			       size_t size)
{
	size_t ret;

	ret = iommu_unmap(domain, iova, size);
	/*
	 * It is a logic error in this code or a driver bug if the IOMMU unmaps
	 * something other than exactly as requested. This implies that the
	 * iommu driver may not fail unmap for reasons beyond bad arguments.
	 * Particularly, the iommu driver may not do a memory allocation on the
	 * unmap path.
	 */
	WARN_ON(ret != size);
}

static void iopt_area_unmap_domain_range(struct iopt_area *area,
					 struct iommu_domain *domain,
					 unsigned long start_index,
					 unsigned long last_index)
{
	unsigned long start_iova = iopt_area_index_to_iova(area, start_index);

	iommu_unmap_nofail(domain, start_iova,
			   iopt_area_index_to_iova_last(area, last_index) -
				   start_iova + 1);
}

static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
						     unsigned long index)
{
	struct interval_tree_node *node;

	node = interval_tree_iter_first(&pages->domains_itree, index, index);
	if (!node)
		return NULL;
	return container_of(node, struct iopt_area, pages_node);
}

/*
 * A simple datastructure to hold a vector of PFNs, optimized for contiguous
 * PFNs. This is used as a temporary holding memory for shuttling pfns from one
 * place to another. Generally everything is made more efficient if operations
 * work on the largest possible grouping of pfns, eg fewer lock/unlock cycles,
 * better cache locality, etc
 */
struct pfn_batch {
	unsigned long *pfns;
	u32 *npfns;
	unsigned int array_size;
	unsigned int end;
	unsigned int total_pfns;
};
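/*
 * Example of the batch encoding (illustrative): adding pfns
 * {100, 101, 102, 200} via batch_add_pfn() below yields
 *
 *	pfns  = {100, 200}
 *	npfns = {3, 1}
 *	end = 2, total_pfns = 4
 *
 * Each entry describes a run of contiguous pfns, so a single entry can cover
 * an entire hugepage.
 */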
static void batch_clear(struct pfn_batch *batch)
{
	batch->total_pfns = 0;
	batch->end = 0;
	batch->pfns[0] = 0;
	batch->npfns[0] = 0;
}

/*
 * Carry means we carry a portion of the final hugepage over to the front of
 * the batch
 */
static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns)
{
	if (!keep_pfns)
		return batch_clear(batch);

	batch->total_pfns = keep_pfns;
	batch->pfns[0] = batch->pfns[batch->end - 1] +
			 (batch->npfns[batch->end - 1] - keep_pfns);
	batch->npfns[0] = keep_pfns;
	/* The carried pfns remain valid as entry 0 of the cleared batch */
	batch->end = 1;
}

static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns)
{
	if (!batch->total_pfns)
		return;
	skip_pfns = min(batch->total_pfns, skip_pfns);
	batch->pfns[0] += skip_pfns;
	batch->npfns[0] -= skip_pfns;
	batch->total_pfns -= skip_pfns;
}
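/*
 * Worked example (illustrative): if the batch ends with the entry
 * pfns[end - 1] = 512, npfns[end - 1] = 8, then batch_clear_carry(batch, 2)
 * leaves pfns[0] = 512 + (8 - 2) = 518, npfns[0] = 2: the last two pages of
 * the trailing hugepage are carried to the front of the cleared batch. A
 * following batch_skip_carry(batch, 2) would consume them again by advancing
 * pfns[0] and dropping total_pfns back to 0.
 */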
static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup,
			size_t backup_len)
{
	const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns);
	size_t size = max_pages * elmsz;

	batch->pfns = temp_kmalloc(&size, backup, backup_len);
	if (!batch->pfns)
		return -ENOMEM;
	batch->array_size = size / elmsz;
	batch->npfns = (u32 *)(batch->pfns + batch->array_size);
	batch_clear(batch);
	return 0;
}

static int batch_init(struct pfn_batch *batch, size_t max_pages)
{
	return __batch_init(batch, max_pages, NULL, 0);
}

static void batch_init_backup(struct pfn_batch *batch, size_t max_pages,
			      void *backup, size_t backup_len)
{
	__batch_init(batch, max_pages, backup, backup_len);
}

static void batch_destroy(struct pfn_batch *batch, void *backup)
{
	if (batch->pfns != backup)
		kfree(batch->pfns);
}

/* true if the pfn could be added, false otherwise */
static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
{
	const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));

	if (batch->end &&
	    pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
	    batch->npfns[batch->end - 1] != MAX_NPFNS) {
		batch->npfns[batch->end - 1]++;
		batch->total_pfns++;
		return true;
	}
	if (batch->end == batch->array_size)
		return false;
	batch->total_pfns++;
	batch->pfns[batch->end] = pfn;
	batch->npfns[batch->end] = 1;
	batch->end++;
	return true;
}

/*
 * Fill the batch with pfns from the domain. When the batch is full, or it
 * reaches last_index, the function will return. The caller should use
 * batch->total_pfns to determine the starting point for the next iteration.
 */
static void batch_from_domain(struct pfn_batch *batch,
			      struct iommu_domain *domain,
			      struct iopt_area *area, unsigned long start_index,
			      unsigned long last_index)
{
	unsigned int page_offset = 0;
	unsigned long iova;
	phys_addr_t phys;

	iova = iopt_area_index_to_iova(area, start_index);
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	while (start_index <= last_index) {
		/*
		 * This is pretty slow, it would be nice to get the page size
		 * back from the driver, or have the driver directly fill the
		 * batch.
		 */
		phys = iommu_iova_to_phys(domain, iova) - page_offset;
		if (!batch_add_pfn(batch, PHYS_PFN(phys)))
			return;
		iova += PAGE_SIZE - page_offset;
		page_offset = 0;
		start_index++;
	}
}
static struct page **raw_pages_from_domain(struct iommu_domain *domain,
					   struct iopt_area *area,
					   unsigned long start_index,
					   unsigned long last_index,
					   struct page **out_pages)
{
	unsigned int page_offset = 0;
	unsigned long iova;
	phys_addr_t phys;

	iova = iopt_area_index_to_iova(area, start_index);
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	while (start_index <= last_index) {
		phys = iommu_iova_to_phys(domain, iova) - page_offset;
		*(out_pages++) = pfn_to_page(PHYS_PFN(phys));
		iova += PAGE_SIZE - page_offset;
		page_offset = 0;
		start_index++;
	}
	return out_pages;
}

/* Continues reading a domain until we reach a discontiguity in the pfns. */
static void batch_from_domain_continue(struct pfn_batch *batch,
				       struct iommu_domain *domain,
				       struct iopt_area *area,
				       unsigned long start_index,
				       unsigned long last_index)
{
	unsigned int array_size = batch->array_size;

	batch->array_size = batch->end;
	batch_from_domain(batch, domain, area, start_index, last_index);
	batch->array_size = array_size;
}
/*
 * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That
 * mode permits splitting a mapped area up, and then one of the splits is
 * unmapped. Doing this normally would cause us to violate our invariant of
 * pairing map/unmap. Thus, to support old VFIO compatibility, disable the
 * batching of consecutive PFNs. All PFNs mapped into the iommu are done in
 * PAGE_SIZE units, not larger or smaller.
 */
static int batch_iommu_map_small(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t paddr,
				 size_t size, int prot)
{
	unsigned long start_iova = iova;
	int rc;

	while (size) {
		rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot);
		if (rc)
			goto err_unmap;
		iova += PAGE_SIZE;
		paddr += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	return 0;

err_unmap:
	if (start_iova != iova)
		iommu_unmap_nofail(domain, start_iova, iova - start_iova);
	return rc;
}

static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
			   struct iopt_area *area, unsigned long start_index)
{
	bool disable_large_pages = area->iopt->disable_large_pages;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned int page_offset = 0;
	unsigned long start_iova;
	unsigned long next_iova;
	unsigned int cur = 0;
	unsigned long iova;
	int rc;

	/* The first index might be a partial page */
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	next_iova = iova = start_iova =
		iopt_area_index_to_iova(area, start_index);
	while (cur < batch->end) {
		next_iova = min(last_iova + 1,
				next_iova + batch->npfns[cur] * PAGE_SIZE -
					page_offset);
		if (disable_large_pages)
			rc = batch_iommu_map_small(
				domain, iova,
				PFN_PHYS(batch->pfns[cur]) + page_offset,
				next_iova - iova, area->iommu_prot);
		else
			rc = iommu_map(domain, iova,
				       PFN_PHYS(batch->pfns[cur]) + page_offset,
				       next_iova - iova, area->iommu_prot);
		if (rc)
			goto err_unmap;
		iova = next_iova;
		page_offset = 0;
		cur++;
	}
	return 0;
err_unmap:
	if (start_iova != iova)
		iommu_unmap_nofail(domain, start_iova, iova - start_iova);
	return rc;
}
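/*
 * Illustrative arithmetic (assumed values): mapping a batch entry of three
 * contiguous pfns at the very start of an area with page_offset == 0x800
 * issues a single iommu_map() at the area's first IOVA for
 * 3 * PAGE_SIZE - 0x800 bytes, with the physical address advanced by the same
 * 0x800. Every later entry starts page aligned with page_offset == 0, so maps
 * and unmaps always cut on the same boundaries.
 */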
static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa,
			      unsigned long start_index,
			      unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	rcu_read_lock();
	while (true) {
		entry = xas_next(&xas);
		if (xas_retry(&xas, entry))
			continue;
		WARN_ON(!xa_is_value(entry));
		if (!batch_add_pfn(batch, xa_to_value(entry)) ||
		    start_index == last_index)
			break;
		start_index++;
	}
	rcu_read_unlock();
}

static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa,
				    unsigned long start_index,
				    unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	xas_lock(&xas);
	while (true) {
		entry = xas_next(&xas);
		if (xas_retry(&xas, entry))
			continue;
		WARN_ON(!xa_is_value(entry));
		if (!batch_add_pfn(batch, xa_to_value(entry)))
			break;
		xas_store(&xas, NULL);
		if (start_index == last_index)
			break;
		start_index++;
	}
	xas_unlock(&xas);
}

static void clear_xarray(struct xarray *xa, unsigned long start_index,
			 unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	xas_lock(&xas);
	xas_for_each(&xas, entry, last_index)
		xas_store(&xas, NULL);
	xas_unlock(&xas);
}

static int pages_to_xarray(struct xarray *xa, unsigned long start_index,
			   unsigned long last_index, struct page **pages)
{
	struct page **end_pages = pages + (last_index - start_index) + 1;
	struct page **half_pages = pages + (end_pages - pages) / 2;
	XA_STATE(xas, xa, start_index);

	do {
		void *old;

		xas_lock(&xas);
		while (pages != end_pages) {
			/* xarray does not participate in fault injection */
			if (pages == half_pages && iommufd_should_fail()) {
				xas_set_err(&xas, -EINVAL);
				xas_unlock(&xas);
				/* aka xas_destroy() */
				xas_nomem(&xas, GFP_KERNEL);
				goto err_clear;
			}

			old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages)));
			if (xas_error(&xas))
				break;
			WARN_ON(old);
			pages++;
			xas_next(&xas);
		}
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

err_clear:
	if (xas_error(&xas)) {
		if (xas.xa_index != start_index)
			clear_xarray(xa, start_index, xas.xa_index - 1);
		return xas_error(&xas);
	}
	return 0;
}
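/*
 * Note on the encoding (descriptive): pfns are kept in the xarray as value
 * entries, xa_mk_value(pfn) on store and xa_to_value(entry) on load, rather
 * than as pointers. This needs no storage beyond the xarray nodes themselves
 * and lets readers such as batch_from_xarray() run under rcu_read_lock().
 */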
static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
			     size_t npages)
{
	struct page **end = pages + npages;

	for (; pages != end; pages++)
		if (!batch_add_pfn(batch, page_to_pfn(*pages)))
			break;
}

static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
			unsigned int first_page_off, size_t npages)
{
	unsigned int cur = 0;

	while (first_page_off) {
		if (batch->npfns[cur] > first_page_off)
			break;
		first_page_off -= batch->npfns[cur];
		cur++;
	}

	while (npages) {
		size_t to_unpin = min_t(size_t, npages,
					batch->npfns[cur] - first_page_off);

		unpin_user_page_range_dirty_lock(
			pfn_to_page(batch->pfns[cur] + first_page_off),
			to_unpin, pages->writable);
		iopt_pages_sub_npinned(pages, to_unpin);
		cur++;
		first_page_off = 0;
		npages -= to_unpin;
	}
}
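/*
 * Worked example (illustrative): in batch_unpin() above, with npfns = {4, 2}
 * and first_page_off = 5, the skip loop consumes all four pages of entry 0,
 * leaving first_page_off = 1 at entry 1, so unpinning starts at pfn
 * batch->pfns[1] + 1 and at most one page can come from that entry.
 */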
static void copy_data_page(struct page *page, void *data, unsigned long offset,
			   size_t length, unsigned int flags)
{
	void *mem;

	mem = kmap_local_page(page);
	if (flags & IOMMUFD_ACCESS_RW_WRITE) {
		memcpy(mem + offset, data, length);
		set_page_dirty_lock(page);
	} else {
		memcpy(data, mem + offset, length);
	}
	kunmap_local(mem);
}

static unsigned long batch_rw(struct pfn_batch *batch, void *data,
			      unsigned long offset, unsigned long length,
			      unsigned int flags)
{
	unsigned long copied = 0;
	unsigned int npage = 0;
	unsigned int cur = 0;

	while (cur < batch->end) {
		unsigned long bytes = min(length, PAGE_SIZE - offset);

		copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data,
			       offset, bytes, flags);
		offset = 0;
		length -= bytes;
		data += bytes;
		copied += bytes;
		npage++;
		if (npage == batch->npfns[cur]) {
			npage = 0;
			cur++;
		}
		if (!length)
			break;
	}
	return copied;
}

/* pfn_reader_user is just the pin_user_pages() path */
struct pfn_reader_user {
	struct page **upages;
	size_t upages_len;
	unsigned long upages_start;
	unsigned long upages_end;
	unsigned int gup_flags;
	/*
	 * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is
	 * neither
	 */
	int locked;
};

static void pfn_reader_user_init(struct pfn_reader_user *user,
				 struct iopt_pages *pages)
{
	user->upages = NULL;
	user->upages_start = 0;
	user->upages_end = 0;
	user->locked = -1;

	if (pages->writable) {
		user->gup_flags = FOLL_LONGTERM | FOLL_WRITE;
	} else {
		/* Still need to break COWs on read */
		user->gup_flags = FOLL_LONGTERM | FOLL_FORCE | FOLL_WRITE;
	}
}

static void pfn_reader_user_destroy(struct pfn_reader_user *user,
				    struct iopt_pages *pages)
{
	if (user->locked != -1) {
		if (user->locked)
			mmap_read_unlock(pages->source_mm);
		if (pages->source_mm != current->mm)
			mmput(pages->source_mm);
		user->locked = 0;
	}

	kfree(user->upages);
	user->upages = NULL;
}
static int pfn_reader_user_pin(struct pfn_reader_user *user,
			       struct iopt_pages *pages,
			       unsigned long start_index,
			       unsigned long last_index)
{
	bool remote_mm = pages->source_mm != current->mm;
	unsigned long npages;
	uintptr_t uptr;
	long rc;

	if (!user->upages) {
		/* All undone in pfn_reader_destroy() */
		user->upages_len =
			(last_index - start_index + 1) * sizeof(*user->upages);
		user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
		if (!user->upages)
			return -ENOMEM;
	}

	if (user->locked == -1) {
		/*
		 * The majority of usages will run the map task within the mm
		 * providing the pages, so we can optimize into
		 * get_user_pages_fast()
		 */
		if (remote_mm) {
			if (!mmget_not_zero(pages->source_mm))
				return -EFAULT;
		}
		user->locked = 0;
	}

	npages = min_t(unsigned long, last_index - start_index + 1,
		       user->upages_len / sizeof(*user->upages));

	if (iommufd_should_fail())
		return -EFAULT;

	uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
	if (!remote_mm)
		rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
					 user->upages);
	else {
		if (!user->locked) {
			mmap_read_lock(pages->source_mm);
			user->locked = 1;
		}
		/*
		 * FIXME: last NULL can be &pfns->locked once the GUP patch
		 * is merged.
		 */
		rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
					   user->gup_flags, user->upages, NULL,
					   NULL);
	}
	if (rc <= 0) {
		if (WARN_ON(!rc))
			return -EFAULT;
		return rc;
	}
	iopt_pages_add_npinned(pages, rc);
	user->upages_start = start_index;
	user->upages_end = start_index + rc;
	return 0;
}
/* This is the "modern" and faster accounting method used by io_uring */
static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
{
	unsigned long lock_limit;
	unsigned long cur_pages;
	unsigned long new_pages;

	lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >>
		     PAGE_SHIFT;
	npages = pages->npinned - pages->last_npinned;
	do {
		cur_pages = atomic_long_read(&pages->source_user->locked_vm);
		new_pages = cur_pages + npages;
		if (new_pages > lock_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
				     new_pages) != cur_pages);
	return 0;
}

static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
{
	if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages))
		return;
	atomic_long_sub(npages, &pages->source_user->locked_vm);
}

/* This is the accounting method used for compatibility with VFIO */
static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
			       bool inc, struct pfn_reader_user *user)
{
	bool do_put = false;
	int rc;

	if (user && user->locked) {
		mmap_read_unlock(pages->source_mm);
		user->locked = 0;
		/* If we had the lock then we also have a get */
	} else if ((!user || !user->upages) &&
		   pages->source_mm != current->mm) {
		if (!mmget_not_zero(pages->source_mm))
			return -EINVAL;
		do_put = true;
	}

	mmap_write_lock(pages->source_mm);
	rc = __account_locked_vm(pages->source_mm, npages, inc,
				 pages->source_task, false);
	mmap_write_unlock(pages->source_mm);

	if (do_put)
		mmput(pages->source_mm);
	return rc;
}
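/*
 * Worked example (illustrative): with an RLIMIT_MEMLOCK of 16 pages and 10
 * pages already in source_user->locked_vm, an incr_user_locked_vm() covering
 * 8 newly pinned pages computes new_pages = 18 > 16 and returns -ENOMEM
 * without touching the counter; the caller then unwinds the pins it just
 * took.
 */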
static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
			    bool inc, struct pfn_reader_user *user)
{
	int rc = 0;

	switch (pages->account_mode) {
	case IOPT_PAGES_ACCOUNT_NONE:
		break;
	case IOPT_PAGES_ACCOUNT_USER:
		if (inc)
			rc = incr_user_locked_vm(pages, npages);
		else
			decr_user_locked_vm(pages, npages);
		break;
	case IOPT_PAGES_ACCOUNT_MM:
		rc = update_mm_locked_vm(pages, npages, inc, user);
		break;
	}
	if (rc)
		return rc;

	pages->last_npinned = pages->npinned;
	if (inc)
		atomic64_add(npages, &pages->source_mm->pinned_vm);
	else
		atomic64_sub(npages, &pages->source_mm->pinned_vm);
	return 0;
}

static void update_unpinned(struct iopt_pages *pages)
{
	if (WARN_ON(pages->npinned > pages->last_npinned))
		return;
	if (pages->npinned == pages->last_npinned)
		return;
	do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
			 NULL);
}

/*
 * Changes in the number of pages pinned are done after the pages have been
 * read and processed. If the pin went over the limit then the error unwind
 * will unpin everything that was just pinned. This is because it is expensive
 * to calculate how many pages we have already pinned within a range to
 * generate an accurate prediction in advance of doing the work to actually
 * pin them.
 */
static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
					 struct iopt_pages *pages)
{
	unsigned long npages;
	bool inc;

	lockdep_assert_held(&pages->mutex);

	if (pages->npinned == pages->last_npinned)
		return 0;

	if (pages->npinned < pages->last_npinned) {
		npages = pages->last_npinned - pages->npinned;
		inc = false;
	} else {
		if (iommufd_should_fail())
			return -ENOMEM;
		npages = pages->npinned - pages->last_npinned;
		inc = true;
	}
	return do_update_pinned(pages, npages, inc, user);
}

/*
 * PFNs are stored in three places, in order of preference:
 * - The iopt_pages xarray. This is only populated if there is an
 *   iopt_pages_access
 * - The iommu_domain under an area
 * - The original PFN source, ie pages->source_mm
 *
 * This iterator reads the pfns optimizing to load according to the
 * above order.
 */
struct pfn_reader {
	struct iopt_pages *pages;
	struct interval_tree_double_span_iter span;
	struct pfn_batch batch;
	unsigned long batch_start_index;
	unsigned long batch_end_index;
	unsigned long last_index;

	struct pfn_reader_user user;
};

static int pfn_reader_update_pinned(struct pfn_reader *pfns)
{
	return pfn_reader_user_update_pinned(&pfns->user, pfns->pages);
}
/*
 * The batch can contain a mixture of pages that are still in use and pages
 * that need to be unpinned. Unpin only pages that are not held anywhere else.
 */
static void pfn_reader_unpin(struct pfn_reader *pfns)
{
	unsigned long last = pfns->batch_end_index - 1;
	unsigned long start = pfns->batch_start_index;
	struct interval_tree_double_span_iter span;
	struct iopt_pages *pages = pfns->pages;

	lockdep_assert_held(&pages->mutex);

	interval_tree_for_each_double_span(&span, &pages->access_itree,
					   &pages->domains_itree, start, last) {
		if (span.is_used)
			continue;

		batch_unpin(&pfns->batch, pages, span.start_hole - start,
			    span.last_hole - span.start_hole + 1);
	}
}
/* Process a single span to load it from the proper storage */
static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
	struct interval_tree_double_span_iter *span = &pfns->span;
	unsigned long start_index = pfns->batch_end_index;
	struct iopt_area *area;
	int rc;

	if (span->is_used == 1) {
		batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns,
				  start_index, span->last_used);
		return 0;
	}

	if (span->is_used == 2) {
		/*
		 * Pull as many pages from the first domain we find in the
		 * target span. If it is too small then we will be called again
		 * and we'll find another area.
		 */
		area = iopt_pages_find_domain_area(pfns->pages, start_index);
		if (WARN_ON(!area))
			return -EINVAL;

		/* The storage_domain cannot change without the pages mutex */
		batch_from_domain(
			&pfns->batch, area->storage_domain, area, start_index,
			min(iopt_area_last_index(area), span->last_used));
		return 0;
	}

	if (start_index >= pfns->user.upages_end) {
		rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
					 span->last_hole);
		if (rc)
			return rc;
	}

	batch_from_pages(&pfns->batch,
			 pfns->user.upages +
				 (start_index - pfns->user.upages_start),
			 pfns->user.upages_end - start_index);
	return 0;
}

static bool pfn_reader_done(struct pfn_reader *pfns)
{
	return pfns->batch_start_index == pfns->last_index + 1;
}

static int pfn_reader_next(struct pfn_reader *pfns)
{
	int rc;

	batch_clear(&pfns->batch);
	pfns->batch_start_index = pfns->batch_end_index;

	while (pfns->batch_end_index != pfns->last_index + 1) {
		unsigned int npfns = pfns->batch.total_pfns;

		rc = pfn_reader_fill_span(pfns);
		if (rc)
			return rc;

		if (WARN_ON(!pfns->batch.total_pfns))
			return -EINVAL;

		pfns->batch_end_index =
			pfns->batch_start_index + pfns->batch.total_pfns;
		if (pfns->batch_end_index == pfns->span.last_used + 1)
			interval_tree_double_span_iter_next(&pfns->span);

		/* Batch is full */
		if (npfns == pfns->batch.total_pfns)
			return 0;
	}
	return 0;
}
static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
			   unsigned long start_index, unsigned long last_index)
{
	int rc;

	lockdep_assert_held(&pages->mutex);

	pfns->pages = pages;
	pfns->batch_start_index = start_index;
	pfns->batch_end_index = start_index;
	pfns->last_index = last_index;
	pfn_reader_user_init(&pfns->user, pages);
	rc = batch_init(&pfns->batch, last_index - start_index + 1);
	if (rc)
		return rc;
	interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree,
					     &pages->domains_itree, start_index,
					     last_index);
	return 0;
}

/*
 * There are many assertions regarding the state of pages->npinned vs
 * pages->last_npinned, for instance something like unmapping a domain must
 * only decrement the npinned, and pfn_reader_destroy() must be called only
 * after all the pins are updated. This is fine for success flows, but error
 * flows sometimes need to release the pins held inside the pfn_reader before
 * going on to complete unmapping and releasing pins held in domains.
 */
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
	struct iopt_pages *pages = pfns->pages;

	if (pfns->user.upages_end > pfns->batch_end_index) {
		size_t npages = pfns->user.upages_end - pfns->batch_end_index;

		/* Any pages not transferred to the batch are just unpinned */
		unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
						      pfns->user.upages_start),
				 npages);
		iopt_pages_sub_npinned(pages, npages);
		pfns->user.upages_end = pfns->batch_end_index;
	}
	if (pfns->batch_start_index != pfns->batch_end_index) {
		pfn_reader_unpin(pfns);
		pfns->batch_start_index = pfns->batch_end_index;
	}
}

static void pfn_reader_destroy(struct pfn_reader *pfns)
{
	struct iopt_pages *pages = pfns->pages;

	pfn_reader_release_pins(pfns);
	pfn_reader_user_destroy(&pfns->user, pfns->pages);
	batch_destroy(&pfns->batch, NULL);
	WARN_ON(pages->last_npinned != pages->npinned);
}

static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
			    unsigned long start_index, unsigned long last_index)
{
	int rc;

	rc = pfn_reader_init(pfns, pages, start_index, last_index);
	if (rc)
		return rc;
	rc = pfn_reader_next(pfns);
	if (rc) {
		pfn_reader_destroy(pfns);
		return rc;
	}
	return 0;
}
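/*
 * Typical consumer loop (a sketch modelled on the fill/unfill routines in
 * this file; not a verbatim copy of any one caller):
 *
 *	struct pfn_reader pfns;
 *	int rc;
 *
 *	rc = pfn_reader_first(&pfns, pages, start_index, last_index);
 *	if (rc)
 *		return rc;
 *	while (!pfn_reader_done(&pfns)) {
 *		// consume pfns.batch, which covers indexes starting at
 *		// pfns.batch_start_index
 *		rc = pfn_reader_next(&pfns);
 *		if (rc)
 *			break;
 *	}
 *	if (!rc)
 *		rc = pfn_reader_update_pinned(&pfns);
 *	pfn_reader_destroy(&pfns);
 *	return rc;
 */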

struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
				    bool writable)
{
	struct iopt_pages *pages;

	/*
	 * The iommu API uses size_t as the length, so protect the
	 * DIV_ROUND_UP below from overflow.
	 */
	if (length > SIZE_MAX - PAGE_SIZE || length == 0)
		return ERR_PTR(-EINVAL);

	pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	kref_init(&pages->kref);
	xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT);
	mutex_init(&pages->mutex);
	pages->source_mm = current->mm;
	mmgrab(pages->source_mm);
	pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE);
	pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE);
	pages->access_itree = RB_ROOT_CACHED;
	pages->domains_itree = RB_ROOT_CACHED;
	pages->writable = writable;
	if (capable(CAP_IPC_LOCK))
		pages->account_mode = IOPT_PAGES_ACCOUNT_NONE;
	else
		pages->account_mode = IOPT_PAGES_ACCOUNT_USER;
	pages->source_task = current->group_leader;
	get_task_struct(current->group_leader);
	pages->source_user = get_uid(current_user());
	return pages;
}

void iopt_release_pages(struct kref *kref)
{
	struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref);

	WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root));
	WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root));
	WARN_ON(pages->npinned);
	WARN_ON(!xa_empty(&pages->pinned_pfns));
	mmdrop(pages->source_mm);
	mutex_destroy(&pages->mutex);
	put_task_struct(pages->source_task);
	free_uid(pages->source_user);
	kfree(pages);
}
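
/*
 * Illustrative sketch (hypothetical caller): an iopt_pages is created from a
 * userspace buffer and released through its kref once every area and access
 * has let go of it:
 *
 *	struct iopt_pages *pages;
 *
 *	pages = iopt_alloc_pages(uptr, length, true);
 *	if (IS_ERR(pages))
 *		return PTR_ERR(pages);
 *	// ... hand the pages to one or more iopt_areas ...
 *	kref_put(&pages->kref, iopt_release_pages);
 *
 * Note that uptr may be unaligned; iopt_alloc_pages() rounds uptr down and
 * npages up so that the logical PFN array always covers whole pages.
 */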

static void
iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area,
		       struct iopt_pages *pages, struct iommu_domain *domain,
		       unsigned long start_index, unsigned long last_index,
		       unsigned long *unmapped_end_index,
		       unsigned long real_last_index)
{
	while (start_index <= last_index) {
		unsigned long batch_last_index;

		if (*unmapped_end_index <= last_index) {
			unsigned long start =
				max(start_index, *unmapped_end_index);

			batch_from_domain(batch, domain, area, start,
					  last_index);
			batch_last_index = start + batch->total_pfns - 1;
		} else {
			batch_last_index = last_index;
		}

		/*
		 * unmaps must always 'cut' at a place where the pfns are not
		 * contiguous to pair with the maps that always install
		 * contiguous pages. Thus, if we have to stop unpinning in the
		 * middle of the domains we need to keep reading pfns until we
		 * find a cut point to do the unmap. The pfns we read are
		 * carried over and either skipped or integrated into the next
		 * batch.
		 */
		if (batch_last_index == last_index &&
		    last_index != real_last_index)
			batch_from_domain_continue(batch, domain, area,
						   last_index + 1,
						   real_last_index);

		if (*unmapped_end_index <= batch_last_index) {
			iopt_area_unmap_domain_range(
				area, domain, *unmapped_end_index,
				start_index + batch->total_pfns - 1);
			*unmapped_end_index = start_index + batch->total_pfns;
		}

		/* unpin must follow unmap */
		batch_unpin(batch, pages, 0,
			    batch_last_index - start_index + 1);
		start_index = batch_last_index + 1;

		batch_clear_carry(batch,
				  *unmapped_end_index - batch_last_index - 1);
	}
}
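
/*
 * Worked example of the carry (an illustration, not code): suppose indexes
 * 0 to 9 are to be unpinned, but indexes 10 and 11 are mapped contiguously
 * with 9 and are still held by an access. batch_from_domain_continue() reads
 * 10 and 11 into the batch so the unmap can cut at the end of the contiguous
 * run, iopt_area_unmap_domain_range() then unmaps 0 to 11, batch_unpin()
 * releases only 0 to 9, and batch_clear_carry() keeps 10 and 11 at the front
 * of the batch for the next iteration.
 */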

static void __iopt_area_unfill_domain(struct iopt_area *area,
				      struct iopt_pages *pages,
				      struct iommu_domain *domain,
				      unsigned long last_index)
{
	struct interval_tree_double_span_iter span;
	unsigned long start_index = iopt_area_index(area);
	unsigned long unmapped_end_index = start_index;
	u64 backup[BATCH_BACKUP_SIZE];
	struct pfn_batch batch;

	lockdep_assert_held(&pages->mutex);

	/*
	 * For security we must not unpin something that is still DMA mapped,
	 * so this must unmap any IOVA before we go ahead and unpin the pages.
	 * This creates a complexity where we need to skip over unpinning pages
	 * held in the xarray, but continue to unmap from the domain.
	 *
	 * The domain unmap cannot stop in the middle of a contiguous range of
	 * PFNs. To solve this problem the unpinning step will read ahead to the
	 * end of any contiguous span, unmap that whole span, and then only
	 * unpin the leading part that does not have any accesses. The residual
	 * PFNs that were unmapped but not unpinned are called a "carry" in the
	 * batch as they are moved to the front of the PFN list and continue on
	 * to the next iteration(s).
	 */
	batch_init_backup(&batch, last_index + 1, backup, sizeof(backup));
	interval_tree_for_each_double_span(&span, &pages->domains_itree,
					   &pages->access_itree, start_index,
					   last_index) {
		if (span.is_used) {
			batch_skip_carry(&batch,
					 span.last_used - span.start_used + 1);
			continue;
		}
		iopt_area_unpin_domain(&batch, area, pages, domain,
				       span.start_hole, span.last_hole,
				       &unmapped_end_index, last_index);
	}
	/*
	 * If the range ends in an access then we do the residual unmap without
	 * any unpins.
	 */
	if (unmapped_end_index != last_index + 1)
		iopt_area_unmap_domain_range(area, domain, unmapped_end_index,
					     last_index);
	WARN_ON(batch.total_pfns);
	batch_destroy(&batch, backup);
	update_unpinned(pages);
}

static void iopt_area_unfill_partial_domain(struct iopt_area *area,
					    struct iopt_pages *pages,
					    struct iommu_domain *domain,
					    unsigned long end_index)
{
	if (end_index != iopt_area_index(area))
		__iopt_area_unfill_domain(area, pages, domain, end_index - 1);
}

/**
 * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain
 * @area: The IOVA range to unmap
 * @domain: The domain to unmap
 *
 * The caller must know that unpinning is not required, usually because there
 * are other domains in the iopt.
 */
void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain)
{
	iommu_unmap_nofail(domain, iopt_area_iova(area),
			   iopt_area_length(area));
}

/**
 * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain
 * @area: IOVA area to use
 * @pages: page supplier for the area (area->pages is NULL)
 * @domain: Domain to unmap from
 *
 * The domain should be removed from the domains_itree before calling. The
 * domain will always be unmapped, but the PFNs may not be unpinned if there are
 * still accesses.
 */
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
			     struct iommu_domain *domain)
{
	__iopt_area_unfill_domain(area, pages, domain,
				  iopt_area_last_index(area));
}
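
/*
 * A sketch of how the two flavours above differ (illustration only): with
 * several domains attached, tearing down one of the extra domains only needs
 * the IOVA unmapped since the other domains still hold the pins, while
 * tearing down the last user of the PFNs must unpin as well:
 *
 *	iopt_area_unmap_domain(area, extra_domain);
 *	iopt_area_unfill_domain(area, area->pages, last_domain);
 */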

/**
 * iopt_area_fill_domain() - Map PFNs from the area into a domain
 * @area: IOVA area to use
 * @domain: Domain to load PFNs into
 *
 * Read the pfns from the area's underlying iopt_pages and map them into the
 * given domain. Called when attaching a new domain to an io_pagetable.
 */
int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
{
	unsigned long done_end_index;
	struct pfn_reader pfns;
	int rc;

	lockdep_assert_held(&area->pages->mutex);

	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
			      iopt_area_last_index(area));
	if (rc)
		return rc;

	while (!pfn_reader_done(&pfns)) {
		done_end_index = pfns.batch_start_index;
		rc = batch_to_domain(&pfns.batch, domain, area,
				     pfns.batch_start_index);
		if (rc)
			goto out_unmap;
		done_end_index = pfns.batch_end_index;

		rc = pfn_reader_next(&pfns);
		if (rc)
			goto out_unmap;
	}

	rc = pfn_reader_update_pinned(&pfns);
	if (rc)
		goto out_unmap;
	goto out_destroy;

out_unmap:
	pfn_reader_release_pins(&pfns);
	iopt_area_unfill_partial_domain(area, area->pages, domain,
					done_end_index);
out_destroy:
	pfn_reader_destroy(&pfns);
	return rc;
}
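
/*
 * Illustrative sketch (hypothetical caller; the real attach logic lives in
 * io_pagetable.c): attaching a new domain walks every area and fills it,
 * holding the pages mutex that iopt_area_fill_domain() asserts:
 *
 *	mutex_lock(&area->pages->mutex);
 *	rc = iopt_area_fill_domain(area, new_domain);
 *	mutex_unlock(&area->pages->mutex);
 *	if (rc)
 *		unwind_previous_areas();	// hypothetical rollback
 */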

/**
 * iopt_area_fill_domains() - Install PFNs into the area's domains
 * @area: The area to act on
 * @pages: The pages associated with the area (area->pages is NULL)
 *
 * Called during area creation. The area is freshly created and not inserted in
 * the domains_itree yet. PFNs are read and loaded into every domain held in the
 * area's io_pagetable and the area is installed in the domains_itree.
 *
 * On failure all domains are left unchanged.
 */
int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
{
	unsigned long done_first_end_index;
	unsigned long done_all_end_index;
	struct iommu_domain *domain;
	unsigned long unmap_index;
	struct pfn_reader pfns;
	unsigned long index;
	int rc;

	lockdep_assert_held(&area->iopt->domains_rwsem);

	if (xa_empty(&area->iopt->domains))
		return 0;

	mutex_lock(&pages->mutex);
	rc = pfn_reader_first(&pfns, pages, iopt_area_index(area),
			      iopt_area_last_index(area));
	if (rc)
		goto out_unlock;

	while (!pfn_reader_done(&pfns)) {
		done_first_end_index = pfns.batch_end_index;
		done_all_end_index = pfns.batch_start_index;
		xa_for_each(&area->iopt->domains, index, domain) {
			rc = batch_to_domain(&pfns.batch, domain, area,
					     pfns.batch_start_index);
			if (rc)
				goto out_unmap;
		}
		done_all_end_index = done_first_end_index;

		rc = pfn_reader_next(&pfns);
		if (rc)
			goto out_unmap;
	}
	rc = pfn_reader_update_pinned(&pfns);
	if (rc)
		goto out_unmap;

	area->storage_domain = xa_load(&area->iopt->domains, 0);
	interval_tree_insert(&area->pages_node, &pages->domains_itree);
	goto out_destroy;

out_unmap:
	pfn_reader_release_pins(&pfns);
	xa_for_each(&area->iopt->domains, unmap_index, domain) {
		unsigned long end_index;

		if (unmap_index < index)
			end_index = done_first_end_index;
		else
			end_index = done_all_end_index;

		/*
		 * The area is not yet part of the domains_itree so we have to
		 * manage the unpinning specially. The last domain does the
		 * unpin, every other domain is just unmapped.
		 */
		if (unmap_index != area->iopt->next_domain_id - 1) {
			if (end_index != iopt_area_index(area))
				iopt_area_unmap_domain_range(
					area, domain, iopt_area_index(area),
					end_index - 1);
		} else {
			iopt_area_unfill_partial_domain(area, pages, domain,
							end_index);
		}
	}
out_destroy:
	pfn_reader_destroy(&pfns);
out_unlock:
	mutex_unlock(&pages->mutex);
	return rc;
}

/**
 * iopt_area_unfill_domains() - unmap PFNs from the area's domains
 * @area: The area to act on
 * @pages: The pages associated with the area (area->pages is NULL)
 *
 * Called during area destruction. This unmaps the IOVAs covered by all the
 * area's domains and releases the PFNs.
 */
void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
{
	struct io_pagetable *iopt = area->iopt;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held(&iopt->domains_rwsem);

	mutex_lock(&pages->mutex);
	if (!area->storage_domain)
		goto out_unlock;

	xa_for_each(&iopt->domains, index, domain)
		if (domain != area->storage_domain)
			iopt_area_unmap_domain_range(
				area, domain, iopt_area_index(area),
				iopt_area_last_index(area));

	interval_tree_remove(&area->pages_node, &pages->domains_itree);
	iopt_area_unfill_domain(area, pages, area->storage_domain);
	area->storage_domain = NULL;
out_unlock:
	mutex_unlock(&pages->mutex);
}

static void iopt_pages_unpin_xarray(struct pfn_batch *batch,
				    struct iopt_pages *pages,
				    unsigned long start_index,
				    unsigned long end_index)
{
	while (start_index <= end_index) {
		batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index,
					end_index);
		batch_unpin(batch, pages, 0, batch->total_pfns);
		start_index += batch->total_pfns;
		batch_clear(batch);
	}
}

/**
 * iopt_pages_unfill_xarray() - Update the xarray after removing an access
 * @pages: The pages to act on
 * @start_index: Starting PFN index
 * @last_index: Last PFN index
 *
 * Called when an iopt_pages_access is removed, removes the pages from the
 * xarray. The access should already be removed from the access_itree.
 */
void iopt_pages_unfill_xarray(struct iopt_pages *pages,
			      unsigned long start_index,
			      unsigned long last_index)
{
	struct interval_tree_double_span_iter span;
	u64 backup[BATCH_BACKUP_SIZE];
	struct pfn_batch batch;
	bool batch_inited = false;

	lockdep_assert_held(&pages->mutex);

	interval_tree_for_each_double_span(&span, &pages->access_itree,
					   &pages->domains_itree, start_index,
					   last_index) {
		if (!span.is_used) {
			if (!batch_inited) {
				batch_init_backup(&batch,
						  last_index - start_index + 1,
						  backup, sizeof(backup));
				batch_inited = true;
			}
			iopt_pages_unpin_xarray(&batch, pages, span.start_hole,
						span.last_hole);
		} else if (span.is_used == 2) {
			/* Covered by a domain */
			clear_xarray(&pages->pinned_pfns, span.start_used,
				     span.last_used);
		}
		/* Otherwise covered by an existing access */
	}
	if (batch_inited)
		batch_destroy(&batch, backup);
	update_unpinned(pages);
}
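
/*
 * Worked example (illustration only): unfilling indexes 0 to 9 where 0 to 3
 * were used only by the departing access, 4 to 6 are also stored in a domain,
 * and 7 to 9 are still claimed by another access. The double span iteration
 * sees:
 *	0-3 as a hole	  -> unpin and clear the xarray entries
 *	4-6 as is_used 2  -> clear the xarray only, the domain keeps the pins
 *	7-9 as is_used 1  -> leave the entries, the other access needs them
 */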

/**
 * iopt_pages_fill_from_xarray() - Fast path for reading PFNs
 * @pages: The pages to act on
 * @start_index: The first page index in the range
 * @last_index: The last page index in the range
 * @out_pages: The output array to return the pages
 *
 * This can be called if the caller is holding a refcount on an
 * iopt_pages_access that is known to have already been filled. It quickly reads
 * the pages directly from the xarray.
 *
 * This is part of the SW iommu interface to read pages for in-kernel use.
 */
void iopt_pages_fill_from_xarray(struct iopt_pages *pages,
				 unsigned long start_index,
				 unsigned long last_index,
				 struct page **out_pages)
{
	XA_STATE(xas, &pages->pinned_pfns, start_index);
	void *entry;

	rcu_read_lock();
	while (start_index <= last_index) {
		entry = xas_next(&xas);
		if (xas_retry(&xas, entry))
			continue;
		WARN_ON(!xa_is_value(entry));
		*(out_pages++) = pfn_to_page(xa_to_value(entry));
		start_index++;
	}
	rcu_read_unlock();
}
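
/*
 * Illustrative sketch (hypothetical caller): the fast path is only valid
 * while an already-filled iopt_pages_access keeps the range pinned:
 *
 *	// access for [start_index, last_index] was established earlier by
 *	// iopt_area_add_access()
 *	iopt_pages_fill_from_xarray(pages, start_index, last_index, out_pages);
 *
 * Since the xarray entries cannot vanish while the access holds its user
 * count, the read runs under rcu_read_lock() with no allocation or pinning.
 */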

static int iopt_pages_fill_from_domain(struct iopt_pages *pages,
				       unsigned long start_index,
				       unsigned long last_index,
				       struct page **out_pages)
{
	while (start_index != last_index + 1) {
		unsigned long domain_last;
		struct iopt_area *area;

		area = iopt_pages_find_domain_area(pages, start_index);
		if (WARN_ON(!area))
			return -EINVAL;

		domain_last = min(iopt_area_last_index(area), last_index);
		out_pages = raw_pages_from_domain(area->storage_domain, area,
						  start_index, domain_last,
						  out_pages);
		start_index = domain_last + 1;
	}
	return 0;
}

static int iopt_pages_fill_from_mm(struct iopt_pages *pages,
				   struct pfn_reader_user *user,
				   unsigned long start_index,
				   unsigned long last_index,
				   struct page **out_pages)
{
	unsigned long cur_index = start_index;
	int rc;

	while (cur_index != last_index + 1) {
		user->upages = out_pages + (cur_index - start_index);
		rc = pfn_reader_user_pin(user, pages, cur_index, last_index);
		if (rc)
			goto out_unpin;
		cur_index = user->upages_end;
	}
	return 0;

out_unpin:
	if (start_index != cur_index)
		iopt_pages_err_unpin(pages, start_index, cur_index - 1,
				     out_pages);
	return rc;
}

/**
 * iopt_pages_fill_xarray() - Read PFNs
 * @pages: The pages to act on
 * @start_index: The first page index in the range
 * @last_index: The last page index in the range
 * @out_pages: The output array to return the pages, may be NULL
 *
 * This populates the xarray and returns the pages in out_pages. As the slow
 * path this is able to copy pages from other storage tiers into the xarray.
 *
 * On failure the xarray is left unchanged.
 *
 * This is part of the SW iommu interface to read pages for in-kernel use.
 */
int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index,
			   unsigned long last_index, struct page **out_pages)
{
	struct interval_tree_double_span_iter span;
	unsigned long xa_end = start_index;
	struct pfn_reader_user user;
	int rc;

	lockdep_assert_held(&pages->mutex);

	pfn_reader_user_init(&user, pages);
	user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages);
	interval_tree_for_each_double_span(&span, &pages->access_itree,
					   &pages->domains_itree, start_index,
					   last_index) {
		struct page **cur_pages;

		if (span.is_used == 1) {
			cur_pages = out_pages + (span.start_used - start_index);
			iopt_pages_fill_from_xarray(pages, span.start_used,
						    span.last_used, cur_pages);
			continue;
		}

		if (span.is_used == 2) {
			cur_pages = out_pages + (span.start_used - start_index);
			iopt_pages_fill_from_domain(pages, span.start_used,
						    span.last_used, cur_pages);
			rc = pages_to_xarray(&pages->pinned_pfns,
					     span.start_used, span.last_used,
					     cur_pages);
			if (rc)
				goto out_clean_xa;
			xa_end = span.last_used + 1;
			continue;
		}

		/* hole */
		cur_pages = out_pages + (span.start_hole - start_index);
		rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole,
					     span.last_hole, cur_pages);
		if (rc)
			goto out_clean_xa;
		rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole,
				     span.last_hole, cur_pages);
		if (rc) {
			iopt_pages_err_unpin(pages, span.start_hole,
					     span.last_hole, cur_pages);
			goto out_clean_xa;
		}
		xa_end = span.last_hole + 1;
	}
	rc = pfn_reader_user_update_pinned(&user, pages);
	if (rc)
		goto out_clean_xa;
	user.upages = NULL;
	pfn_reader_user_destroy(&user, pages);
	return 0;

out_clean_xa:
	if (start_index != xa_end)
		iopt_pages_unfill_xarray(pages, start_index, xa_end - 1);
	user.upages = NULL;
	pfn_reader_user_destroy(&user, pages);
	return rc;
}

/*
 * This uses the pfn_reader instead of taking a shortcut by using the mm. It can
 * do every scenario and is fully consistent with what an iommu_domain would
 * see.
 */
static int iopt_pages_rw_slow(struct iopt_pages *pages,
			      unsigned long start_index,
			      unsigned long last_index, unsigned long offset,
			      void *data, unsigned long length,
			      unsigned int flags)
{
	struct pfn_reader pfns;
	int rc;

	mutex_lock(&pages->mutex);

	rc = pfn_reader_first(&pfns, pages, start_index, last_index);
	if (rc)
		goto out_unlock;

	while (!pfn_reader_done(&pfns)) {
		unsigned long done;

		done = batch_rw(&pfns.batch, data, offset, length, flags);
		data += done;
		length -= done;
		offset = 0;
		pfn_reader_unpin(&pfns);

		rc = pfn_reader_next(&pfns);
		if (rc)
			goto out_destroy;
	}
	if (WARN_ON(length != 0))
		rc = -EINVAL;
out_destroy:
	pfn_reader_destroy(&pfns);
out_unlock:
	mutex_unlock(&pages->mutex);
	return rc;
}
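
/*
 * Worked example of the copy loop above (illustration only): copying 10000
 * bytes starting at offset 100 into a batch that currently spans two 4k
 * pages. The first batch_rw() can consume at most 2 * 4096 - 100 = 8092
 * bytes, so the loop continues with data advanced by 8092, length = 1908 and
 * offset reset to 0 for the next batch.
 */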

/*
 * A medium speed path that still allows DMA inconsistencies, but doesn't do any
 * memory allocations or interval tree searches.
 */
static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index,
			      unsigned long offset, void *data,
			      unsigned long length, unsigned int flags)
{
	struct page *page = NULL;
	int rc;

	if (!mmget_not_zero(pages->source_mm))
		return iopt_pages_rw_slow(pages, index, index, offset, data,
					  length, flags);

	if (iommufd_should_fail()) {
		rc = -EINVAL;
		goto out_mmput;
	}

	mmap_read_lock(pages->source_mm);
	rc = pin_user_pages_remote(
		pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE),
		1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page,
		NULL, NULL);
	mmap_read_unlock(pages->source_mm);
	if (rc != 1) {
		if (WARN_ON(rc >= 0))
			rc = -EINVAL;
		goto out_mmput;
	}
	copy_data_page(page, data, offset, length, flags);
	unpin_user_page(page);
	rc = 0;

out_mmput:
	mmput(pages->source_mm);
	return rc;
}

/**
 * iopt_pages_rw_access - Copy to/from a linear slice of the pages
 * @pages: pages to act on
 * @start_byte: First byte of pages to copy to/from
 * @data: Kernel buffer to get/put the data
 * @length: Number of bytes to copy
 * @flags: IOMMUFD_ACCESS_RW_* flags
 *
 * This will find each page in the range, kmap it and then memcpy to/from
 * the given kernel buffer.
 */
int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
			 void *data, unsigned long length, unsigned int flags)
{
	unsigned long start_index = start_byte / PAGE_SIZE;
	unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE;
	bool change_mm = current->mm != pages->source_mm;
	int rc = 0;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
	    (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH))
		change_mm = true;

	if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
		return -EPERM;

	if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) {
		if (start_index == last_index)
			return iopt_pages_rw_page(pages, start_index,
						  start_byte % PAGE_SIZE, data,
						  length, flags);
		return iopt_pages_rw_slow(pages, start_index, last_index,
					  start_byte % PAGE_SIZE, data, length,
					  flags);
	}

	/*
	 * Try to copy using copy_to_user(). We do this as a fast path and
	 * ignore any pinning inconsistencies, unlike a real DMA path.
	 */
	if (change_mm) {
		if (!mmget_not_zero(pages->source_mm))
			return iopt_pages_rw_slow(pages, start_index,
						  last_index,
						  start_byte % PAGE_SIZE, data,
						  length, flags);
		kthread_use_mm(pages->source_mm);
	}

	if (flags & IOMMUFD_ACCESS_RW_WRITE) {
		if (copy_to_user(pages->uptr + start_byte, data, length))
			rc = -EFAULT;
	} else {
		if (copy_from_user(data, pages->uptr + start_byte, length))
			rc = -EFAULT;
	}

	if (change_mm) {
		kthread_unuse_mm(pages->source_mm);
		mmput(pages->source_mm);
	}

	return rc;
}
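
/*
 * Illustrative summary of the path selection above (a sketch, not new
 * behaviour): a caller running in the owning mm, or a kthread passing
 * IOMMUFD_ACCESS_RW_KTHREAD so it may adopt it, goes through the
 * copy_to_user()/copy_from_user() fast path; any other cross-mm caller uses
 * iopt_pages_rw_page() when the range fits in one page and
 * iopt_pages_rw_slow() otherwise. For example, a driver kthread reading 8
 * bytes might do:
 *
 *	rc = iopt_pages_rw_access(pages, start_byte, buf, 8,
 *				  IOMMUFD_ACCESS_RW_KTHREAD);
 */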

static struct iopt_pages_access *
iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index,
			    unsigned long last)
{
	struct interval_tree_node *node;

	lockdep_assert_held(&pages->mutex);

	/* There can be overlapping ranges in this interval tree */
	for (node = interval_tree_iter_first(&pages->access_itree, index, last);
	     node; node = interval_tree_iter_next(node, index, last))
		if (node->start == index && node->last == last)
			return container_of(node, struct iopt_pages_access,
					    node);
	return NULL;
}

/**
 * iopt_area_add_access() - Record an in-kernel access for PFNs
 * @area: The source of PFNs
 * @start_index: First page index
 * @last_index: Inclusive last page index
 * @out_pages: Output list of struct page's representing the PFNs
 * @flags: IOMMUFD_ACCESS_RW_* flags
 *
 * Record that an in-kernel access will be accessing the pages, ensure they are
 * pinned, and return the PFNs as a simple list of 'struct page *'.
 *
 * This should be undone through a matching call to iopt_area_remove_access().
 */
int iopt_area_add_access(struct iopt_area *area, unsigned long start_index,
			 unsigned long last_index, struct page **out_pages,
			 unsigned int flags)
{
	struct iopt_pages *pages = area->pages;
	struct iopt_pages_access *access;
	int rc;

	if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable)
		return -EPERM;

	mutex_lock(&pages->mutex);
	access = iopt_pages_get_exact_access(pages, start_index, last_index);
	if (access) {
		area->num_accesses++;
		access->users++;
		iopt_pages_fill_from_xarray(pages, start_index, last_index,
					    out_pages);
		mutex_unlock(&pages->mutex);
		return 0;
	}

	access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT);
	if (!access) {
		rc = -ENOMEM;
		goto err_unlock;
	}

	rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages);
	if (rc)
		goto err_free;

	access->node.start = start_index;
	access->node.last = last_index;
	access->users = 1;
	area->num_accesses++;
	interval_tree_insert(&access->node, &pages->access_itree);
	mutex_unlock(&pages->mutex);
	return 0;

err_free:
	kfree(access);
err_unlock:
	mutex_unlock(&pages->mutex);
	return rc;
}

/**
 * iopt_area_remove_access() - Release an in-kernel access for PFNs
 * @area: The source of PFNs
 * @start_index: First page index
 * @last_index: Inclusive last page index
 *
 * Undo iopt_area_add_access() and unpin the pages if necessary. The caller
 * must stop using the PFNs before calling this.
 */
void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
			     unsigned long last_index)
{
	struct iopt_pages *pages = area->pages;
	struct iopt_pages_access *access;

	mutex_lock(&pages->mutex);
	access = iopt_pages_get_exact_access(pages, start_index, last_index);
	if (WARN_ON(!access))
		goto out_unlock;

	WARN_ON(area->num_accesses == 0 || access->users == 0);
	area->num_accesses--;
	access->users--;
	if (access->users)
		goto out_unlock;

	interval_tree_remove(&access->node, &pages->access_itree);
	iopt_pages_unfill_xarray(pages, start_index, last_index);
	kfree(access);
out_unlock:
	mutex_unlock(&pages->mutex);
}
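
/*
 * Illustrative sketch (hypothetical in-kernel user): an access pins a page
 * range, uses the returned pages, then releases them with the exact same
 * index range, since the access is looked up again with
 * iopt_pages_get_exact_access():
 *
 *	rc = iopt_area_add_access(area, start_index, last_index, out_pages, 0);
 *	if (rc)
 *		return rc;
 *	// ... access the memory, e.g. via kmap_local_page(out_pages[i]) ...
 *	iopt_area_remove_access(area, start_index, last_index);
 */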