// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The iopt_pages is the center of the storage and motion of PFNs. Each
 * iopt_pages represents a logical linear array of full PFNs. The array is 0
 * based and has npages in it. Accessors use 'index' to refer to the entry in
 * this logical array, regardless of its storage location.
 *
 * PFNs are stored in a tiered scheme:
 *  1) iopt_pages::pinned_pfns xarray
 *  2) An iommu_domain
 *  3) The origin of the PFNs, i.e. the userspace pointer
 *
 * PFNs have to be copied between all combinations of tiers, depending on the
 * configuration.
 *
 * When a PFN is taken out of the userspace pointer it is pinned exactly once.
 * The storage locations of the PFN's index are tracked in the two interval
 * trees. If no interval includes the index then it is not pinned.
 *
 * If access_itree includes the PFN's index then an in-kernel access has
 * requested the page. The PFN is stored in the xarray so other requestors can
 * continue to find it.
 *
 * If the domains_itree includes the PFN's index then an iommu_domain is storing
 * the PFN and it can be read back using iommu_iova_to_phys(). To avoid
 * duplicating storage the xarray is not used if only iommu_domains are using
 * the PFN's index.
 *
 * As a general principle this is designed so that destroy never fails. This
 * means removing an iommu_domain or releasing an in-kernel access will not fail
 * due to insufficient memory. In practice this means some cases have to hold
 * PFNs in the xarray even though they are also being stored in an iommu_domain.
 *
 * While the iopt_pages can use an iommu_domain as storage, it does not have an
 * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the
 * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages
 * and reference their own slice of the PFN array, with sub page granularity.
 *
 * In this file the term 'last' indicates an inclusive and closed interval, eg
 * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to
 * no PFNs.
 *
 * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so
 * last_iova + 1 can overflow. An iopt_pages index will always be much less than
 * ULONG_MAX so last_index + 1 cannot overflow.
 */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/iommufd.h>

#include "io_pagetable.h"
#include "double_span.h"

#define TEMP_MEMORY_LIMIT 65536
#define BATCH_BACKUP_SIZE 32

/*
 * More memory makes pin_user_pages() and the batching more efficient, but as
 * this is only a performance optimization don't try too hard to get it. A 64k
 * allocation can hold about 26M of 4k pages and 13G of 2M pages in a
 * pfn_batch. Various destroy paths cannot fail and provide a small amount of
 * stack memory as a backup contingency. If backup_len is given this cannot
 * fail.
 */
static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len)
{
	void *res;

	if (WARN_ON(*size == 0))
		return NULL;

	if (*size < backup_len)
		return backup;
	*size = min_t(size_t, *size, TEMP_MEMORY_LIMIT);
	res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (res)
		return res;
	*size = PAGE_SIZE;
	if (backup_len) {
		res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
		if (res)
			return res;
		*size = backup_len;
		return backup;
	}
	return kmalloc(*size, GFP_KERNEL);
}
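/*
 * The double span iterator below walks two interval trees in lockstep over an
 * index range. iter->is_used describes the current span: -1 once the walk is
 * done, 0 for a hole covered by neither tree, and 1 or 2 when the span is
 * stored in the first or second tree (the first tree wins if both cover it).
 * A sketch of the consumption pattern, as used by pfn_reader_unpin() further
 * down (itree_a/itree_b and the index bounds are placeholder names):
 *
 *	struct interval_tree_double_span_iter span;
 *
 *	interval_tree_for_each_double_span(&span, &itree_a, &itree_b,
 *					   start_index, last_index) {
 *		if (span.is_used)
 *			continue;
 *		<operate on the hole span.start_hole .. span.last_hole>
 *	}
 */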
void interval_tree_double_span_iter_update(
	struct interval_tree_double_span_iter *iter)
{
	unsigned long last_hole = ULONG_MAX;
	unsigned int i;

	for (i = 0; i != ARRAY_SIZE(iter->spans); i++) {
		if (interval_tree_span_iter_done(&iter->spans[i])) {
			iter->is_used = -1;
			return;
		}

		if (iter->spans[i].is_hole) {
			last_hole = min(last_hole, iter->spans[i].last_hole);
			continue;
		}

		iter->is_used = i + 1;
		iter->start_used = iter->spans[i].start_used;
		iter->last_used = min(iter->spans[i].last_used, last_hole);
		return;
	}

	iter->is_used = 0;
	iter->start_hole = iter->spans[0].start_hole;
	iter->last_hole =
		min(iter->spans[0].last_hole, iter->spans[1].last_hole);
}

void interval_tree_double_span_iter_first(
	struct interval_tree_double_span_iter *iter,
	struct rb_root_cached *itree1, struct rb_root_cached *itree2,
	unsigned long first_index, unsigned long last_index)
{
	unsigned int i;

	iter->itrees[0] = itree1;
	iter->itrees[1] = itree2;
	for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
		interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i],
					      first_index, last_index);
	interval_tree_double_span_iter_update(iter);
}

void interval_tree_double_span_iter_next(
	struct interval_tree_double_span_iter *iter)
{
	unsigned int i;

	if (iter->is_used == -1 ||
	    iter->last_hole == iter->spans[0].last_index) {
		iter->is_used = -1;
		return;
	}

	for (i = 0; i != ARRAY_SIZE(iter->spans); i++)
		interval_tree_span_iter_advance(
			&iter->spans[i], iter->itrees[i], iter->last_hole + 1);
	interval_tree_double_span_iter_update(iter);
}

static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages)
{
	pages->npinned += npages;
}

static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages)
{
	pages->npinned -= npages;
}

static void iopt_pages_err_unpin(struct iopt_pages *pages,
				 unsigned long start_index,
				 unsigned long last_index,
				 struct page **page_list)
{
	unsigned long npages = last_index - start_index + 1;

	unpin_user_pages(page_list, npages);
	iopt_pages_sub_npinned(pages, npages);
}

/*
 * index is the number of PAGE_SIZE units from the start of the area's
 * iopt_pages. If the iova is sub page-size then the area has an iova that
 * covers a portion of the first and last pages in the range.
 */
static unsigned long iopt_area_index_to_iova(struct iopt_area *area,
					     unsigned long index)
{
	index -= iopt_area_index(area);
	if (index == 0)
		return iopt_area_iova(area);
	return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE;
}

static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area,
						  unsigned long index)
{
	if (index == iopt_area_last_index(area))
		return iopt_area_last_iova(area);
	return iopt_area_iova(area) - area->page_offset +
	       (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1;
}
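/*
 * A worked example of the two helpers above (values are illustrative only,
 * assuming 4k pages): for an area whose IOVA starts 0x200 bytes into its
 * first page (page_offset == 0x200) at iova 0x10200, the area's first index
 * maps to iova 0x10200 with a last iova of 0x10fff, and the next index maps
 * to the full page 0x11000 .. 0x11fff. Only the first and last pages of an
 * area can be partially covered in this way.
 */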
static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova,
			       size_t size)
{
	size_t ret;

	ret = iommu_unmap(domain, iova, size);
	/*
	 * It is a logic error in this code or a driver bug if the IOMMU unmaps
	 * something other than exactly as requested. This implies that the
	 * iommu driver may not fail unmap for reasons beyond bad arguments.
	 * Particularly, the iommu driver may not do a memory allocation on the
	 * unmap path.
	 */
	WARN_ON(ret != size);
}

static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages,
						     unsigned long index)
{
	struct interval_tree_node *node;

	node = interval_tree_iter_first(&pages->domains_itree, index, index);
	if (!node)
		return NULL;
	return container_of(node, struct iopt_area, pages_node);
}

/*
 * A simple datastructure to hold a vector of PFNs, optimized for contiguous
 * PFNs. This is used as a temporary holding memory for shuttling pfns from one
 * place to another. Generally everything is made more efficient if operations
 * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles,
 * better cache locality, etc
 */
struct pfn_batch {
	unsigned long *pfns;
	u32 *npfns;
	unsigned int array_size;
	unsigned int end;
	unsigned int total_pfns;
};
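/*
 * Layout sketch (values illustrative): after batch_add_pfn() below is called
 * with PFNs 5, 6, 7 and then 100, the batch holds pfns = {5, 100},
 * npfns = {3, 1}, end = 2 and total_pfns = 4. Runs of physically contiguous
 * PFNs collapse into a single entry, which is what lets batch_to_domain()
 * hand multi-page contiguous ranges to iommu_map() in one call.
 */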
static void batch_clear(struct pfn_batch *batch)
{
	batch->total_pfns = 0;
	batch->end = 0;
	batch->pfns[0] = 0;
	batch->npfns[0] = 0;
}

/*
 * Carry means we carry a portion of the final hugepage over to the front of
 * the batch
 */
static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns)
{
	if (!keep_pfns)
		return batch_clear(batch);

	batch->total_pfns = keep_pfns;
	batch->npfns[0] = keep_pfns;
	batch->pfns[0] = batch->pfns[batch->end - 1] +
			 (batch->npfns[batch->end - 1] - keep_pfns);
	batch->end = 0;
}

static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns)
{
	if (!batch->total_pfns)
		return;
	skip_pfns = min(batch->total_pfns, skip_pfns);
	batch->pfns[0] += skip_pfns;
	batch->npfns[0] -= skip_pfns;
	batch->total_pfns -= skip_pfns;
}

static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup,
			size_t backup_len)
{
	const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns);
	size_t size = max_pages * elmsz;

	batch->pfns = temp_kmalloc(&size, backup, backup_len);
	if (!batch->pfns)
		return -ENOMEM;
	batch->array_size = size / elmsz;
	batch->npfns = (u32 *)(batch->pfns + batch->array_size);
	batch_clear(batch);
	return 0;
}

static int batch_init(struct pfn_batch *batch, size_t max_pages)
{
	return __batch_init(batch, max_pages, NULL, 0);
}

static void batch_init_backup(struct pfn_batch *batch, size_t max_pages,
			      void *backup, size_t backup_len)
{
	__batch_init(batch, max_pages, backup, backup_len);
}

static void batch_destroy(struct pfn_batch *batch, void *backup)
{
	if (batch->pfns != backup)
		kfree(batch->pfns);
}
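/*
 * Minimal lifecycle sketch for the batch helpers above (local names are
 * illustrative):
 *
 *	struct pfn_batch batch;
 *	int rc = batch_init(&batch, npages);
 *
 *	if (rc)
 *		return rc;
 *	... fill with batch_add_pfn() or batch_from_*(), then drain ...
 *	batch_destroy(&batch, NULL);
 *
 * batch_init_backup() is for destroy paths that are not allowed to fail: per
 * temp_kmalloc()'s contract it falls back to the caller provided backup
 * memory, and batch_destroy() is passed the same backup so it only frees the
 * buffer if it was actually allocated.
 */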
/* true if the pfn could be added, false otherwise */
static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn)
{
	const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns));

	if (batch->end &&
	    pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] &&
	    batch->npfns[batch->end - 1] != MAX_NPFNS) {
		batch->npfns[batch->end - 1]++;
		batch->total_pfns++;
		return true;
	}
	if (batch->end == batch->array_size)
		return false;
	batch->total_pfns++;
	batch->pfns[batch->end] = pfn;
	batch->npfns[batch->end] = 1;
	batch->end++;
	return true;
}

/*
 * Fill the batch with pfns from the domain. When the batch is full, or it
 * reaches last_index, the function will return. The caller should use
 * batch->total_pfns to determine the starting point for the next iteration.
 */
static void batch_from_domain(struct pfn_batch *batch,
			      struct iommu_domain *domain,
			      struct iopt_area *area, unsigned long start_index,
			      unsigned long last_index)
{
	unsigned int page_offset = 0;
	unsigned long iova;
	phys_addr_t phys;

	iova = iopt_area_index_to_iova(area, start_index);
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	while (start_index <= last_index) {
		/*
		 * This is pretty slow, it would be nice to get the page size
		 * back from the driver, or have the driver directly fill the
		 * batch.
		 */
		phys = iommu_iova_to_phys(domain, iova) - page_offset;
		if (!batch_add_pfn(batch, PHYS_PFN(phys)))
			return;
		iova += PAGE_SIZE - page_offset;
		page_offset = 0;
		start_index++;
	}
}
static struct page **raw_pages_from_domain(struct iommu_domain *domain,
					   struct iopt_area *area,
					   unsigned long start_index,
					   unsigned long last_index,
					   struct page **out_pages)
{
	unsigned int page_offset = 0;
	unsigned long iova;
	phys_addr_t phys;

	iova = iopt_area_index_to_iova(area, start_index);
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	while (start_index <= last_index) {
		phys = iommu_iova_to_phys(domain, iova) - page_offset;
		*(out_pages++) = pfn_to_page(PHYS_PFN(phys));
		iova += PAGE_SIZE - page_offset;
		page_offset = 0;
		start_index++;
	}
	return out_pages;
}

/* Continues reading a domain until we reach a discontiguity in the pfns. */
static void batch_from_domain_continue(struct pfn_batch *batch,
				       struct iommu_domain *domain,
				       struct iopt_area *area,
				       unsigned long start_index,
				       unsigned long last_index)
{
	unsigned int array_size = batch->array_size;

	batch->array_size = batch->end;
	batch_from_domain(batch, domain, area, start_index, last_index);
	batch->array_size = array_size;
}
/*
 * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That
 * mode permits splitting a mapped area up, and then one of the splits is
 * unmapped. Doing this normally would cause us to violate our invariant of
 * pairing map/unmap. Thus, to support old VFIO compatibility disable support
 * for batching consecutive PFNs. All PFNs mapped into the iommu are done in
 * PAGE_SIZE units, not larger or smaller.
 */
static int batch_iommu_map_small(struct iommu_domain *domain,
				 unsigned long iova, phys_addr_t paddr,
				 size_t size, int prot)
{
	unsigned long start_iova = iova;
	int rc;

	while (size) {
		rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot);
		if (rc)
			goto err_unmap;
		iova += PAGE_SIZE;
		paddr += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	return 0;

err_unmap:
	if (start_iova != iova)
		iommu_unmap_nofail(domain, start_iova, iova - start_iova);
	return rc;
}

static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain,
			   struct iopt_area *area, unsigned long start_index)
{
	bool disable_large_pages = area->iopt->disable_large_pages;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned int page_offset = 0;
	unsigned long start_iova;
	unsigned long next_iova;
	unsigned int cur = 0;
	unsigned long iova;
	int rc;

	/* The first index might be a partial page */
	if (start_index == iopt_area_index(area))
		page_offset = area->page_offset;
	next_iova = iova = start_iova =
		iopt_area_index_to_iova(area, start_index);
	while (cur < batch->end) {
		next_iova = min(last_iova + 1,
				next_iova + batch->npfns[cur] * PAGE_SIZE -
					page_offset);
		if (disable_large_pages)
			rc = batch_iommu_map_small(
				domain, iova,
				PFN_PHYS(batch->pfns[cur]) + page_offset,
				next_iova - iova, area->iommu_prot);
		else
			rc = iommu_map(domain, iova,
				       PFN_PHYS(batch->pfns[cur]) + page_offset,
				       next_iova - iova, area->iommu_prot);
		if (rc)
			goto err_unmap;
		iova = next_iova;
		page_offset = 0;
		cur++;
	}
	return 0;
err_unmap:
	if (start_iova != iova)
		iommu_unmap_nofail(domain, start_iova, iova - start_iova);
	return rc;
}
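/*
 * Illustrative sketch (not a verbatim caller; 'start', 'last', 'domain' and
 * 'area' are placeholders) of how the batch fill and map helpers compose:
 *
 *	while (start <= last) {
 *		batch_clear(&batch);
 *		<fill the batch from the xarray, another domain or pinned
 *		 user pages for indexes start .. last>
 *		rc = batch_to_domain(&batch, domain, area, start);
 *		if (rc)
 *			break;
 *		start += batch.total_pfns;
 *	}
 *
 * The advance by batch.total_pfns is what the comment above batch_from_domain()
 * refers to: a full batch simply causes another trip around the loop.
 */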
static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa,
			      unsigned long start_index,
			      unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	rcu_read_lock();
	while (true) {
		entry = xas_next(&xas);
		if (xas_retry(&xas, entry))
			continue;
		WARN_ON(!xa_is_value(entry));
		if (!batch_add_pfn(batch, xa_to_value(entry)) ||
		    start_index == last_index)
			break;
		start_index++;
	}
	rcu_read_unlock();
}

static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa,
				    unsigned long start_index,
				    unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	xas_lock(&xas);
	while (true) {
		entry = xas_next(&xas);
		if (xas_retry(&xas, entry))
			continue;
		WARN_ON(!xa_is_value(entry));
		if (!batch_add_pfn(batch, xa_to_value(entry)))
			break;
		xas_store(&xas, NULL);
		if (start_index == last_index)
			break;
		start_index++;
	}
	xas_unlock(&xas);
}

static void clear_xarray(struct xarray *xa, unsigned long start_index,
			 unsigned long last_index)
{
	XA_STATE(xas, xa, start_index);
	void *entry;

	xas_lock(&xas);
	xas_for_each(&xas, entry, last_index)
		xas_store(&xas, NULL);
	xas_unlock(&xas);
}

static int pages_to_xarray(struct xarray *xa, unsigned long start_index,
			   unsigned long last_index, struct page **pages)
{
	struct page **end_pages = pages + (last_index - start_index) + 1;
	XA_STATE(xas, xa, start_index);

	do {
		void *old;

		xas_lock(&xas);
		while (pages != end_pages) {
			old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages)));
			if (xas_error(&xas))
				break;
			WARN_ON(old);
			pages++;
			xas_next(&xas);
		}
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

	if (xas_error(&xas)) {
		if (xas.xa_index != start_index)
			clear_xarray(xa, start_index, xas.xa_index - 1);
		return xas_error(&xas);
	}
	return 0;
}

static void batch_from_pages(struct pfn_batch *batch, struct page **pages,
			     size_t npages)
{
	struct page **end = pages + npages;

	for (; pages != end; pages++)
		if (!batch_add_pfn(batch, page_to_pfn(*pages)))
			break;
}

static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
			unsigned int first_page_off, size_t npages)
{
	unsigned int cur = 0;

	while (first_page_off) {
		if (batch->npfns[cur] > first_page_off)
			break;
		first_page_off -= batch->npfns[cur];
		cur++;
	}

	while (npages) {
		size_t to_unpin = min_t(size_t, npages,
					batch->npfns[cur] - first_page_off);

		unpin_user_page_range_dirty_lock(
			pfn_to_page(batch->pfns[cur] + first_page_off),
			to_unpin, pages->writable);
		iopt_pages_sub_npinned(pages, to_unpin);
		cur++;
		first_page_off = 0;
		npages -= to_unpin;
	}
}

static void copy_data_page(struct page *page, void *data, unsigned long offset,
			   size_t length, unsigned int flags)
{
	void *mem;

	mem = kmap_local_page(page);
	if (flags & IOMMUFD_ACCESS_RW_WRITE) {
		memcpy(mem + offset, data, length);
		set_page_dirty_lock(page);
	} else {
		memcpy(data, mem + offset, length);
	}
	kunmap_local(mem);
}

static unsigned long batch_rw(struct pfn_batch *batch, void *data,
			      unsigned long offset, unsigned long length,
			      unsigned int flags)
{
	unsigned long copied = 0;
	unsigned int npage = 0;
	unsigned int cur = 0;

	while (cur < batch->end) {
		unsigned long bytes = min(length, PAGE_SIZE - offset);

		copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data,
			       offset, bytes, flags);
		offset = 0;
		length -= bytes;
		data += bytes;
		copied += bytes;
		npage++;
		if (npage == batch->npfns[cur]) {
			npage = 0;
			cur++;
		}
		if (!length)
			break;
	}
	return copied;
}

/* pfn_reader_user is just the pin_user_pages() path */
struct pfn_reader_user {
	struct page **upages;
	size_t upages_len;
	unsigned long upages_start;
	unsigned long upages_end;
	unsigned int gup_flags;
	/*
	 * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is
	 * neither
	 */
	int locked;
};

static void pfn_reader_user_init(struct pfn_reader_user *user,
				 struct iopt_pages *pages)
{
	user->upages = NULL;
	user->upages_start = 0;
	user->upages_end = 0;
	user->locked = -1;

	if (pages->writable) {
		user->gup_flags = FOLL_LONGTERM | FOLL_WRITE;
	} else {
		/* Still need to break COWs on read */
		user->gup_flags = FOLL_LONGTERM | FOLL_FORCE | FOLL_WRITE;
	}
}

static void pfn_reader_user_destroy(struct pfn_reader_user *user,
				    struct iopt_pages *pages)
{
	if (user->locked != -1) {
		if (user->locked)
			mmap_read_unlock(pages->source_mm);
		if (pages->source_mm != current->mm)
			mmput(pages->source_mm);
		user->locked = 0;
	}

	kfree(user->upages);
	user->upages = NULL;
}

static int pfn_reader_user_pin(struct pfn_reader_user *user,
			       struct iopt_pages *pages,
			       unsigned long start_index,
			       unsigned long last_index)
{
	bool remote_mm = pages->source_mm != current->mm;
	unsigned long npages;
	uintptr_t uptr;
	long rc;

	if (!user->upages) {
		/* All undone in pfn_reader_destroy() */
		user->upages_len =
			(last_index - start_index + 1) * sizeof(*user->upages);
		user->upages = temp_kmalloc(&user->upages_len, NULL, 0);
		if (!user->upages)
			return -ENOMEM;
	}

	if (user->locked == -1) {
		/*
		 * The majority of usages will run the map task within the mm
		 * providing the pages, so we can optimize into
		 * get_user_pages_fast()
		 */
		if (remote_mm) {
			if (!mmget_not_zero(pages->source_mm))
				return -EFAULT;
		}
		user->locked = 0;
	}

	npages = min_t(unsigned long, last_index - start_index + 1,
		       user->upages_len / sizeof(*user->upages));

	uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE);
	if (!remote_mm)
		rc = pin_user_pages_fast(uptr, npages, user->gup_flags,
					 user->upages);
	else {
		if (!user->locked) {
			mmap_read_lock(pages->source_mm);
			user->locked = 1;
		}
		/*
		 * FIXME: last NULL can be &pfns->locked once the GUP patch
		 * is merged.
		 */
		rc = pin_user_pages_remote(pages->source_mm, uptr, npages,
					   user->gup_flags, user->upages, NULL,
					   NULL);
	}
	if (rc <= 0) {
		if (WARN_ON(!rc))
			return -EFAULT;
		return rc;
	}
	iopt_pages_add_npinned(pages, rc);
	user->upages_start = start_index;
	user->upages_end = start_index + rc;
	return 0;
}

/* This is the "modern" and faster accounting method used by io_uring */
static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
{
	unsigned long lock_limit;
	unsigned long cur_pages;
	unsigned long new_pages;

	lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >>
		     PAGE_SHIFT;
	npages = pages->npinned - pages->last_npinned;
	do {
		cur_pages = atomic_long_read(&pages->source_user->locked_vm);
		new_pages = cur_pages + npages;
		if (new_pages > lock_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages,
				     new_pages) != cur_pages);
	return 0;
}

static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages)
{
	if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages))
		return;
	atomic_long_sub(npages, &pages->source_user->locked_vm);
}
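/*
 * Note on the two accounting modes used below: IOPT_PAGES_ACCOUNT_USER
 * charges user->locked_vm (the per-user counter, as io_uring does), while
 * IOPT_PAGES_ACCOUNT_MM charges the mm via __account_locked_vm() the way
 * VFIO type1 accounting works. do_update_pinned() dispatches between them and
 * also maintains mm->pinned_vm in both cases.
 */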
/* This is the accounting method used for compatibility with VFIO */
static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages,
			       bool inc, struct pfn_reader_user *user)
{
	bool do_put = false;
	int rc;

	if (user && user->locked) {
		mmap_read_unlock(pages->source_mm);
		user->locked = 0;
		/* If we had the lock then we also have a get */
	} else if ((!user || !user->upages) &&
		   pages->source_mm != current->mm) {
		if (!mmget_not_zero(pages->source_mm))
			return -EINVAL;
		do_put = true;
	}

	mmap_write_lock(pages->source_mm);
	rc = __account_locked_vm(pages->source_mm, npages, inc,
				 pages->source_task, false);
	mmap_write_unlock(pages->source_mm);

	if (do_put)
		mmput(pages->source_mm);
	return rc;
}

static int do_update_pinned(struct iopt_pages *pages, unsigned long npages,
			    bool inc, struct pfn_reader_user *user)
{
	int rc = 0;

	switch (pages->account_mode) {
	case IOPT_PAGES_ACCOUNT_NONE:
		break;
	case IOPT_PAGES_ACCOUNT_USER:
		if (inc)
			rc = incr_user_locked_vm(pages, npages);
		else
			decr_user_locked_vm(pages, npages);
		break;
	case IOPT_PAGES_ACCOUNT_MM:
		rc = update_mm_locked_vm(pages, npages, inc, user);
		break;
	}
	if (rc)
		return rc;

	pages->last_npinned = pages->npinned;
	if (inc)
		atomic64_add(npages, &pages->source_mm->pinned_vm);
	else
		atomic64_sub(npages, &pages->source_mm->pinned_vm);
	return 0;
}

static void update_unpinned(struct iopt_pages *pages)
{
	if (WARN_ON(pages->npinned > pages->last_npinned))
		return;
	if (pages->npinned == pages->last_npinned)
		return;
	do_update_pinned(pages, pages->last_npinned - pages->npinned, false,
			 NULL);
}

/*
 * Changes in the number of pages pinned are done after the pages have been
 * read and processed. If the user lacked the limit then the error unwind will
 * unpin everything that was just pinned. This is because it is expensive to
 * calculate how many pages we have already pinned within a range to generate
 * an accurate prediction in advance of doing the work to actually pin them.
 */
static int pfn_reader_user_update_pinned(struct pfn_reader_user *user,
					 struct iopt_pages *pages)
{
	unsigned long npages;
	bool inc;

	lockdep_assert_held(&pages->mutex);

	if (pages->npinned == pages->last_npinned)
		return 0;

	if (pages->npinned < pages->last_npinned) {
		npages = pages->last_npinned - pages->npinned;
		inc = false;
	} else {
		npages = pages->npinned - pages->last_npinned;
		inc = true;
	}
	return do_update_pinned(pages, npages, inc, user);
}

/*
 * PFNs are stored in three places, in order of preference:
 * - The iopt_pages xarray. This is only populated if there is an
 *   iopt_pages_access
 * - The iommu_domain under an area
 * - The original PFN source, ie pages->source_mm
 *
 * This iterator reads the pfns optimizing to load according to the
 * above order.
 */
struct pfn_reader {
	struct iopt_pages *pages;
	struct interval_tree_double_span_iter span;
	struct pfn_batch batch;
	unsigned long batch_start_index;
	unsigned long batch_end_index;
	unsigned long last_index;

	struct pfn_reader_user user;
};

static int pfn_reader_update_pinned(struct pfn_reader *pfns)
{
	return pfn_reader_user_update_pinned(&pfns->user, pfns->pages);
}

/*
 * The batch can contain a mixture of pages that are still in use and pages
 * that need to be unpinned. Unpin only pages that are not held anywhere else.
 */
static void pfn_reader_unpin(struct pfn_reader *pfns)
{
	unsigned long last = pfns->batch_end_index - 1;
	unsigned long start = pfns->batch_start_index;
	struct interval_tree_double_span_iter span;
	struct iopt_pages *pages = pfns->pages;

	lockdep_assert_held(&pages->mutex);

	interval_tree_for_each_double_span(&span, &pages->access_itree,
					   &pages->domains_itree, start, last) {
		if (span.is_used)
			continue;

		batch_unpin(&pfns->batch, pages, span.start_hole - start,
			    span.last_hole - span.start_hole + 1);
	}
}

/* Process a single span to load it from the proper storage */
static int pfn_reader_fill_span(struct pfn_reader *pfns)
{
	struct interval_tree_double_span_iter *span = &pfns->span;
	unsigned long start_index = pfns->batch_end_index;
	struct iopt_area *area;
	int rc;

	if (span->is_used == 1) {
		batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns,
				  start_index, span->last_used);
		return 0;
	}

	if (span->is_used == 2) {
		/*
		 * Pull as many pages from the first domain we find in the
		 * target span. If it is too small then we will be called again
		 * and we'll find another area.
		 */
		area = iopt_pages_find_domain_area(pfns->pages, start_index);
		if (WARN_ON(!area))
			return -EINVAL;

		/* The storage_domain cannot change without the pages mutex */
		batch_from_domain(
			&pfns->batch, area->storage_domain, area, start_index,
			min(iopt_area_last_index(area), span->last_used));
		return 0;
	}

	if (start_index >= pfns->user.upages_end) {
		rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index,
					 span->last_hole);
		if (rc)
			return rc;
	}

	batch_from_pages(&pfns->batch,
			 pfns->user.upages +
				 (start_index - pfns->user.upages_start),
			 pfns->user.upages_end - start_index);
	return 0;
}

static bool pfn_reader_done(struct pfn_reader *pfns)
{
	return pfns->batch_start_index == pfns->last_index + 1;
}

static int pfn_reader_next(struct pfn_reader *pfns)
{
	int rc;

	batch_clear(&pfns->batch);
	pfns->batch_start_index = pfns->batch_end_index;

	while (pfns->batch_end_index != pfns->last_index + 1) {
		unsigned int npfns = pfns->batch.total_pfns;

		rc = pfn_reader_fill_span(pfns);
		if (rc)
			return rc;

		if (WARN_ON(!pfns->batch.total_pfns))
			return -EINVAL;

		pfns->batch_end_index =
			pfns->batch_start_index + pfns->batch.total_pfns;
		if (pfns->batch_end_index == pfns->span.last_used + 1)
			interval_tree_double_span_iter_next(&pfns->span);

		/* Batch is full */
		if (npfns == pfns->batch.total_pfns)
			return 0;
	}
	return 0;
}

static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages,
			   unsigned long start_index, unsigned long last_index)
{
	int rc;

	lockdep_assert_held(&pages->mutex);

	pfns->pages = pages;
	pfns->batch_start_index = start_index;
	pfns->batch_end_index = start_index;
	pfns->last_index = last_index;
	pfn_reader_user_init(&pfns->user, pages);
	rc = batch_init(&pfns->batch, last_index - start_index + 1);
	if (rc)
		return rc;
	interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree,
					     &pages->domains_itree, start_index,
					     last_index);
	return 0;
}

/*
 * There are many assertions regarding the state of pages->npinned vs
 * pages->last_npinned, for instance something like unmapping a domain must
 * only decrement the npinned, and pfn_reader_destroy() must be called only
 * after all the pins are updated. This is fine for success flows, but error
 * flows sometimes need to release the pins held inside the pfn_reader before
 * going on to complete unmapping and releasing pins held in domains.
 */
static void pfn_reader_release_pins(struct pfn_reader *pfns)
{
	struct iopt_pages *pages = pfns->pages;

	if (pfns->user.upages_end > pfns->batch_end_index) {
		size_t npages = pfns->user.upages_end - pfns->batch_end_index;

		/* Any pages not transferred to the batch are just unpinned */
		unpin_user_pages(pfns->user.upages + (pfns->batch_end_index -
						      pfns->user.upages_start),
				 npages);
		iopt_pages_sub_npinned(pages, npages);
		pfns->user.upages_end = pfns->batch_end_index;
	}
	if (pfns->batch_start_index != pfns->batch_end_index) {
		pfn_reader_unpin(pfns);
		pfns->batch_start_index = pfns->batch_end_index;
	}
}

static void pfn_reader_destroy(struct pfn_reader *pfns)
{
	struct iopt_pages *pages = pfns->pages;

	pfn_reader_release_pins(pfns);
	pfn_reader_user_destroy(&pfns->user, pfns->pages);
	batch_destroy(&pfns->batch, NULL);
	WARN_ON(pages->last_npinned != pages->npinned);
}

static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages,
			    unsigned long start_index, unsigned long last_index)
{
	int rc;

	rc = pfn_reader_init(pfns, pages, start_index, last_index);
	if (rc)
		return rc;
	rc = pfn_reader_next(pfns);
	if (rc) {
		pfn_reader_destroy(pfns);
		return rc;
	}
	return 0;
}
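/*
 * Sketch of how the pfn_reader functions above are intended to compose (not a
 * verbatim caller; the work done on each batch is elided):
 *
 *	struct pfn_reader pfns;
 *	int rc;
 *
 *	rc = pfn_reader_first(&pfns, pages, start_index, last_index);
 *	if (rc)
 *		return rc;
 *	while (!pfn_reader_done(&pfns)) {
 *		<consume pfns.batch for indexes pfns.batch_start_index ..
 *		 pfns.batch_end_index - 1>
 *		rc = pfn_reader_next(&pfns);
 *		if (rc)
 *			break;
 *	}
 *	if (!rc)
 *		rc = pfn_reader_update_pinned(&pfns);
 *	pfn_reader_destroy(&pfns);
 *	return rc;
 */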