18ada2c1cSShachar Raindel /*
28ada2c1cSShachar Raindel * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
38ada2c1cSShachar Raindel *
48ada2c1cSShachar Raindel * This software is available to you under a choice of one of two
58ada2c1cSShachar Raindel * licenses. You may choose to be licensed under the terms of the GNU
68ada2c1cSShachar Raindel * General Public License (GPL) Version 2, available from the file
78ada2c1cSShachar Raindel * COPYING in the main directory of this source tree, or the
88ada2c1cSShachar Raindel * OpenIB.org BSD license below:
98ada2c1cSShachar Raindel *
108ada2c1cSShachar Raindel * Redistribution and use in source and binary forms, with or
118ada2c1cSShachar Raindel * without modification, are permitted provided that the following
128ada2c1cSShachar Raindel * conditions are met:
138ada2c1cSShachar Raindel *
148ada2c1cSShachar Raindel * - Redistributions of source code must retain the above
158ada2c1cSShachar Raindel * copyright notice, this list of conditions and the following
168ada2c1cSShachar Raindel * disclaimer.
178ada2c1cSShachar Raindel *
188ada2c1cSShachar Raindel * - Redistributions in binary form must reproduce the above
198ada2c1cSShachar Raindel * copyright notice, this list of conditions and the following
208ada2c1cSShachar Raindel * disclaimer in the documentation and/or other materials
218ada2c1cSShachar Raindel * provided with the distribution.
228ada2c1cSShachar Raindel *
238ada2c1cSShachar Raindel * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
248ada2c1cSShachar Raindel * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
258ada2c1cSShachar Raindel * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
268ada2c1cSShachar Raindel * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
278ada2c1cSShachar Raindel * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
288ada2c1cSShachar Raindel * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
298ada2c1cSShachar Raindel * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
308ada2c1cSShachar Raindel * SOFTWARE.
318ada2c1cSShachar Raindel */
328ada2c1cSShachar Raindel
338ada2c1cSShachar Raindel #include <linux/types.h>
348ada2c1cSShachar Raindel #include <linux/sched.h>
356e84f315SIngo Molnar #include <linux/sched/mm.h>
360881e7bdSIngo Molnar #include <linux/sched/task.h>
378ada2c1cSShachar Raindel #include <linux/pid.h>
388ada2c1cSShachar Raindel #include <linux/slab.h>
398ada2c1cSShachar Raindel #include <linux/export.h>
408ada2c1cSShachar Raindel #include <linux/vmalloc.h>
410008b84eSArtemy Kovalyov #include <linux/hugetlb.h>
427cc2e18fSJason Gunthorpe #include <linux/interval_tree.h>
4336f30e48SYishai Hadas #include <linux/hmm.h>
4475a3e6a3SJohn Hubbard #include <linux/pagemap.h>
458ada2c1cSShachar Raindel
468ada2c1cSShachar Raindel #include <rdma/ib_umem_odp.h>
478ada2c1cSShachar Raindel
488ada2c1cSShachar Raindel #include "uverbs.h"
498ada2c1cSShachar Raindel
/*
 * ib_init_umem_odp - common initialization for an ODP umem
 * @umem_odp: the umem to initialize; caller has already filled in the
 *            embedded struct ib_umem fields (address, length, owning_mm, ...)
 * @ops:      MMU interval notifier ops; may be NULL for implicit parents,
 *            which never register a notifier
 *
 * For a normal (non-implicit) umem this sizes and allocates the pfn/dma
 * tracking arrays and registers the MMU interval notifier over the aligned
 * VA range.  Implicit parent umems have no VA range, so for them only the
 * is_odp flag and the mutex are set up.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
				   const struct mmu_interval_notifier_ops *ops)
{
	int ret;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	if (!umem_odp->is_implicit_odp) {
		size_t page_size = 1UL << umem_odp->page_shift;
		unsigned long start;
		unsigned long end;
		size_t ndmas, npfns;

		/* Expand [address, address+length) to page_size alignment. */
		start = ALIGN_DOWN(umem_odp->umem.address, page_size);
		if (check_add_overflow(umem_odp->umem.address,
				       (unsigned long)umem_odp->umem.length,
				       &end))
			return -EOVERFLOW;
		end = ALIGN(end, page_size);
		/* ALIGN() itself can wrap past the top of the address space */
		if (unlikely(end < page_size))
			return -EOVERFLOW;

		/* One dma_list entry per umem page (page_shift granularity) */
		ndmas = (end - start) >> umem_odp->page_shift;
		if (!ndmas)
			return -EINVAL;

		/* One pfn_list entry per CPU page (PAGE_SHIFT granularity) */
		npfns = (end - start) >> PAGE_SHIFT;
		umem_odp->pfn_list = kvcalloc(
			npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
		if (!umem_odp->pfn_list)
			return -ENOMEM;

		umem_odp->dma_list = kvcalloc(
			ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
		if (!umem_odp->dma_list) {
			ret = -ENOMEM;
			goto out_pfn_list;
		}

		/*
		 * Register the notifier last so no invalidate callback can
		 * observe half-initialized state.
		 */
		ret = mmu_interval_notifier_insert(&umem_odp->notifier,
						   umem_odp->umem.owning_mm,
						   start, end - start, ops);
		if (ret)
			goto out_dma_list;
	}

	return 0;

out_dma_list:
	kvfree(umem_odp->dma_list);
out_pfn_list:
	kvfree(umem_odp->pfn_list);
	return ret;
}
10522d79c9aSJason Gunthorpe
10622d79c9aSJason Gunthorpe /**
10722d79c9aSJason Gunthorpe * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
108f20bef6aSJason Gunthorpe *
109f20bef6aSJason Gunthorpe * Implicit ODP umems do not have a VA range and do not have any page lists.
110f20bef6aSJason Gunthorpe * They exist only to hold the per_mm reference to help the driver create
111f20bef6aSJason Gunthorpe * children umems.
112f20bef6aSJason Gunthorpe *
113f20bef6aSJason Gunthorpe * @device: IB device to create UMEM
114f20bef6aSJason Gunthorpe * @access: ib_reg_mr access flags
115c320e527SMoni Shoua */
ib_umem_odp_alloc_implicit(struct ib_device * device,int access)116f20bef6aSJason Gunthorpe struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
117f20bef6aSJason Gunthorpe int access)
118c320e527SMoni Shoua {
119f20bef6aSJason Gunthorpe struct ib_umem *umem;
120f20bef6aSJason Gunthorpe struct ib_umem_odp *umem_odp;
121f20bef6aSJason Gunthorpe int ret;
122f20bef6aSJason Gunthorpe
123f20bef6aSJason Gunthorpe if (access & IB_ACCESS_HUGETLB)
124f20bef6aSJason Gunthorpe return ERR_PTR(-EINVAL);
125f20bef6aSJason Gunthorpe
126f20bef6aSJason Gunthorpe umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
127f20bef6aSJason Gunthorpe if (!umem_odp)
128f20bef6aSJason Gunthorpe return ERR_PTR(-ENOMEM);
129f20bef6aSJason Gunthorpe umem = &umem_odp->umem;
130f20bef6aSJason Gunthorpe umem->ibdev = device;
131f20bef6aSJason Gunthorpe umem->writable = ib_access_writable(access);
132c320e527SMoni Shoua umem->owning_mm = current->mm;
133f20bef6aSJason Gunthorpe umem_odp->is_implicit_odp = 1;
134f20bef6aSJason Gunthorpe umem_odp->page_shift = PAGE_SHIFT;
135f20bef6aSJason Gunthorpe
136f20bef6aSJason Gunthorpe umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
137f20bef6aSJason Gunthorpe ret = ib_init_umem_odp(umem_odp, NULL);
138f25a546eSJason Gunthorpe if (ret) {
139f25a546eSJason Gunthorpe put_pid(umem_odp->tgid);
140f20bef6aSJason Gunthorpe kfree(umem_odp);
141f25a546eSJason Gunthorpe return ERR_PTR(ret);
142f20bef6aSJason Gunthorpe }
143f20bef6aSJason Gunthorpe return umem_odp;
144f20bef6aSJason Gunthorpe }
145f20bef6aSJason Gunthorpe EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);
146f20bef6aSJason Gunthorpe
/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_alloc_implicit_odp_umem()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only @invalidate
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * Caller must ensure that root cannot be freed during the call to
	 * ib_alloc_odp_umem.
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	/* Children may only hang off an implicit parent */
	if (WARN_ON(!root->is_implicit_odp))
		return ERR_PTR(-EINVAL);

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	/* The child inherits device, writability and mm from the root */
	umem->ibdev = root->umem.ibdev;
	umem->length = size;
	umem->address = addr;
	umem->writable = root->umem.writable;
	umem->owning_mm = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	/*
	 * A mmget must be held when registering a notifier, the owning_mm only
	 * has a mm_grab at this point.
	 */
	if (!mmget_not_zero(umem->owning_mm)) {
		ret = -EFAULT;
		goto out_free;
	}

	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret)
		goto out_tgid;
	/* Registration done; drop back to only the mm_grab reference */
	mmput(umem->owning_mm);
	return odp_data;

out_tgid:
	put_pid(odp_data->tgid);
	mmput(umem->owning_mm);
out_free:
	kfree(odp_data);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);
209d07d1d70SArtemy Kovalyov
210f20bef6aSJason Gunthorpe /**
211d07d1d70SArtemy Kovalyov * ib_umem_odp_get - Create a umem_odp for a userspace va
212f20bef6aSJason Gunthorpe *
213261dc53fSJason Gunthorpe * @device: IB device struct to get UMEM
214f20bef6aSJason Gunthorpe * @addr: userspace virtual address to start at
215c320e527SMoni Shoua * @size: length of region to pin
216261dc53fSJason Gunthorpe * @access: IB_ACCESS_xxx flags for memory being pinned
217261dc53fSJason Gunthorpe * @ops: MMU interval ops, currently only @invalidate
218261dc53fSJason Gunthorpe *
21911708142SColton Lewis * The driver should use when the access flags indicate ODP memory. It avoids
220261dc53fSJason Gunthorpe * pinning, instead, stores the mm for future page fault handling in
221261dc53fSJason Gunthorpe * conjunction with MMU notifiers.
222261dc53fSJason Gunthorpe */
ib_umem_odp_get(struct ib_device * device,unsigned long addr,size_t size,int access,const struct mmu_interval_notifier_ops * ops)223261dc53fSJason Gunthorpe struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
224f20bef6aSJason Gunthorpe unsigned long addr, size_t size, int access,
225c320e527SMoni Shoua const struct mmu_interval_notifier_ops *ops)
226c320e527SMoni Shoua {
227f25a546eSJason Gunthorpe struct ib_umem_odp *umem_odp;
2288ada2c1cSShachar Raindel int ret;
229261dc53fSJason Gunthorpe
230261dc53fSJason Gunthorpe if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
231261dc53fSJason Gunthorpe return ERR_PTR(-EINVAL);
232f25a546eSJason Gunthorpe
233261dc53fSJason Gunthorpe umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
234261dc53fSJason Gunthorpe if (!umem_odp)
235261dc53fSJason Gunthorpe return ERR_PTR(-ENOMEM);
236261dc53fSJason Gunthorpe
237261dc53fSJason Gunthorpe umem_odp->umem.ibdev = device;
238261dc53fSJason Gunthorpe umem_odp->umem.length = size;
239c320e527SMoni Shoua umem_odp->umem.address = addr;
240261dc53fSJason Gunthorpe umem_odp->umem.writable = ib_access_writable(access);
241261dc53fSJason Gunthorpe umem_odp->umem.owning_mm = current->mm;
242261dc53fSJason Gunthorpe umem_odp->notifier.ops = ops;
2431eb23d04SColin Ian King
244f25a546eSJason Gunthorpe umem_odp->page_shift = PAGE_SHIFT;
2458ada2c1cSShachar Raindel #ifdef CONFIG_HUGETLB_PAGE
246d2183c6fSJason Gunthorpe if (access & IB_ACCESS_HUGETLB)
24774f75cdaSArnd Bergmann umem_odp->page_shift = HPAGE_SHIFT;
2489ff1b646SYishai Hadas #endif
2499ff1b646SYishai Hadas
25074f75cdaSArnd Bergmann umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
2510008b84eSArtemy Kovalyov ret = ib_init_umem_odp(umem_odp, ops);
252f25a546eSJason Gunthorpe if (ret)
253f25a546eSJason Gunthorpe goto err_put_pid;
254261dc53fSJason Gunthorpe return umem_odp;
255f25a546eSJason Gunthorpe
256261dc53fSJason Gunthorpe err_put_pid:
257261dc53fSJason Gunthorpe put_pid(umem_odp->tgid);
258f25a546eSJason Gunthorpe kfree(umem_odp);
259f25a546eSJason Gunthorpe return ERR_PTR(ret);
260261dc53fSJason Gunthorpe }
261261dc53fSJason Gunthorpe EXPORT_SYMBOL(ib_umem_odp_get);
2628ada2c1cSShachar Raindel
/*
 * ib_umem_odp_release - tear down and free an ODP umem
 * @umem_odp: the umem to destroy (normal or implicit parent)
 *
 * It is the driver's responsibility to ensure, before calling us, that the
 * hardware will not attempt to access the MR any more.
 */
void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	if (!umem_odp->is_implicit_odp) {
		/* Unmap everything while holding the mutex to fence racing
		 * invalidate callbacks that also take umem_mutex. */
		mutex_lock(&umem_odp->umem_mutex);
		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
					    ib_umem_end(umem_odp));
		mutex_unlock(&umem_odp->umem_mutex);
		/* After remove returns no invalidate can still be running,
		 * so the tracking arrays are safe to free. */
		mmu_interval_notifier_remove(&umem_odp->notifier);
		kvfree(umem_odp->dma_list);
		kvfree(umem_odp->pfn_list);
	}
	put_pid(umem_odp->tgid);
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);
284fd7dbf03SJason Gunthorpe
2850446cad9SJason Gunthorpe /*
2868ada2c1cSShachar Raindel * Map for DMA and insert a single page into the on-demand paging page tables.
2878ada2c1cSShachar Raindel *
2888ada2c1cSShachar Raindel * @umem: the umem to insert the page to.
2898ada2c1cSShachar Raindel * @dma_index: index in the umem to add the dma to.
2908ada2c1cSShachar Raindel * @page: the page struct to map and add.
29136f30e48SYishai Hadas * @access_mask: access permissions needed for this page.
2928ada2c1cSShachar Raindel *
2938ada2c1cSShachar Raindel * The function returns -EFAULT if the DMA mapping operation fails.
2948ada2c1cSShachar Raindel *
29536f30e48SYishai Hadas */
ib_umem_odp_map_dma_single_page(struct ib_umem_odp * umem_odp,unsigned int dma_index,struct page * page,u64 access_mask)2968ada2c1cSShachar Raindel static int ib_umem_odp_map_dma_single_page(
2978ada2c1cSShachar Raindel struct ib_umem_odp *umem_odp,
2988ada2c1cSShachar Raindel unsigned int dma_index,
299b5231b01SJason Gunthorpe struct page *page,
30036f30e48SYishai Hadas u64 access_mask)
3018ada2c1cSShachar Raindel {
30236f30e48SYishai Hadas struct ib_device *dev = umem_odp->umem.ibdev;
3038ada2c1cSShachar Raindel dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
30447f725eeSJason Gunthorpe
30536f30e48SYishai Hadas if (*dma_addr) {
3068ada2c1cSShachar Raindel /*
30736f30e48SYishai Hadas * If the page is already dma mapped it means it went through
30846870b23SJason Gunthorpe * a non-invalidating trasition, like read-only to writable.
30936f30e48SYishai Hadas * Resync the flags.
31036f30e48SYishai Hadas */
31136f30e48SYishai Hadas *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
31246870b23SJason Gunthorpe return 0;
31336f30e48SYishai Hadas }
31436f30e48SYishai Hadas
3158ada2c1cSShachar Raindel *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
3168ada2c1cSShachar Raindel DMA_BIDIRECTIONAL);
31736f30e48SYishai Hadas if (ib_dma_mapping_error(dev, *dma_addr)) {
31836f30e48SYishai Hadas *dma_addr = 0;
31936f30e48SYishai Hadas return -EFAULT;
32036f30e48SYishai Hadas }
32136f30e48SYishai Hadas umem_odp->npages++;
32236f30e48SYishai Hadas *dma_addr |= access_mask;
32336f30e48SYishai Hadas return 0;
32436f30e48SYishai Hadas }
32536f30e48SYishai Hadas
/**
 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
 *
 * Maps the range passed in the argument to DMA addresses.
 * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
 * Upon success the ODP MR will be locked to let caller complete its device
 * page table update.
 *
 * Returns the number of pages mapped in success, negative error code
 * for failure.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual pages mapped is returned in
 *        the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @fault: is faulting required for the given range
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
				 u64 bcnt, u64 access_mask, bool fault)
			__acquires(&umem_odp->umem_mutex)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	int pfn_index, dma_index, ret = 0, start_idx;
	unsigned int page_shift, hmm_order, pfn_start_idx;
	unsigned long num_pfns, current_seq;
	struct hmm_range range = {};
	unsigned long timeout;

	if (access_mask == 0)
		return -EINVAL;

	/* The requested window must lie fully inside the umem's VA range */
	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	page_shift = umem_odp->page_shift;

	/*
	 * owning_process is allowed to be NULL, this means somehow the mm is
	 * existing beyond the lifetime of the originating process.. Presumably
	 * mmget_not_zero will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	range.notifier = &umem_odp->notifier;
	range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
	range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
	/* pfn_list is indexed at CPU-page granularity, dma_list at page_shift */
	pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
	num_pfns = (range.end - range.start) >> PAGE_SHIFT;
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & ODP_WRITE_ALLOWED_BIT)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

	range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
	/* Snapshot the notifier sequence; retried below if an invalidation
	 * raced with the fault. */
	current_seq = range.notifier_seq =
		mmu_interval_read_begin(&umem_odp->notifier);

	mmap_read_lock(owning_mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(owning_mm);
	if (unlikely(ret)) {
		/* -EBUSY means a concurrent invalidation; retry until timeout */
		if (ret == -EBUSY && !time_after(jiffies, timeout))
			goto retry;
		goto out_put_mm;
	}

	start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
	dma_index = start_idx;

	mutex_lock(&umem_odp->umem_mutex);
	/* Validate the snapshot under the mutex; a stale seq means the pfns
	 * may no longer be valid, so fault again. */
	if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
		mutex_unlock(&umem_odp->umem_mutex);
		goto retry;
	}

	/* Walk pfn_list in CPU pages, advancing dma_index once per umem page */
	for (pfn_index = 0; pfn_index < num_pfns;
		pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		if (fault) {
			/*
			 * Since we asked for hmm_range_fault() to populate
			 * pages it shouldn't return an error entry on success.
			 */
			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		} else {
			/* Snapshot mode: skip pages that are not present and
			 * derive the access bits from what the CPU allows. */
			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
				WARN_ON(umem_odp->dma_list[dma_index]);
				continue;
			}
			access_mask = ODP_READ_ALLOWED_BIT;
			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
				access_mask |= ODP_WRITE_ALLOWED_BIT;
		}

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/* The CPU mapping must cover at least one full umem page;
		 * a smaller hmm_order than the umem page_shift is an error. */
		if (hmm_order + PAGE_SHIFT < page_shift) {
			ret = -EINVAL;
			ibdev_dbg(umem_odp->umem.ibdev,
				  "%s: un-expected hmm_order %u, page_shift %u\n",
				  __func__, hmm_order, page_shift);
			break;
		}

		ret = ib_umem_odp_map_dma_single_page(
				umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
				access_mask);
		if (ret < 0) {
			ibdev_dbg(umem_odp->umem.ibdev,
				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
			break;
		}
	}
	/* upon success lock should stay on hold for the callee */
	if (!ret)
		ret = dma_index - start_idx;
	else
		mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
	mmput_async(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
4708ada2c1cSShachar Raindel
/*
 * ib_umem_odp_unmap_dma_pages - DMA unmap pages in the range [virt, bound)
 * @umem_odp: the umem whose pages to unmap
 * @virt: start VA (clamped to the umem's start)
 * @bound: end VA, exclusive (clamped to the umem's end)
 *
 * Caller must hold umem_odp->umem_mutex (asserted below); this serializes
 * against concurrent mapping and against other MMU notifier callbacks.
 */
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	dma_addr_t dma_addr;
	dma_addr_t dma;
	int idx;
	u64 addr;
	struct ib_device *dev = umem_odp->umem.ibdev;

	lockdep_assert_held(&umem_odp->umem_mutex);

	/* Clamp the requested range to what the umem actually covers */
	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		dma = umem_odp->dma_list[idx];

		/* A zero dma_list entry means this page was never DMA mapped */
		if (dma) {
			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);

			/* Strip the access bits to recover the DMA address */
			dma_addr = dma & ODP_DMA_ADDR_MASK;
			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem_odp->page_shift),
					  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
				struct page *head_page = compound_head(page);
				/*
				 * set_page_dirty prefers being called with
				 * the page lock. However, MMU notifiers are
				 * called sometimes with and sometimes without
				 * the lock. We rely on the umem_mutex instead
				 * to prevent other mmu notifiers from
				 * continuing and allowing the page mapping to
				 * be removed.
				 */
				set_page_dirty(head_page);
			}
			umem_odp->dma_list[idx] = 0;
			umem_odp->npages--;
		}
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
5168ada2c1cSShachar Raindel