xref: /openbmc/linux/drivers/infiniband/hw/mlx5/odp.c (revision 594cac11)
18cdd312cSHaggai Eran /*
26cf0a15fSSaeed Mahameed  * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
38cdd312cSHaggai Eran  *
48cdd312cSHaggai Eran  * This software is available to you under a choice of one of two
58cdd312cSHaggai Eran  * licenses.  You may choose to be licensed under the terms of the GNU
68cdd312cSHaggai Eran  * General Public License (GPL) Version 2, available from the file
78cdd312cSHaggai Eran  * COPYING in the main directory of this source tree, or the
88cdd312cSHaggai Eran  * OpenIB.org BSD license below:
98cdd312cSHaggai Eran  *
108cdd312cSHaggai Eran  *     Redistribution and use in source and binary forms, with or
118cdd312cSHaggai Eran  *     without modification, are permitted provided that the following
128cdd312cSHaggai Eran  *     conditions are met:
138cdd312cSHaggai Eran  *
148cdd312cSHaggai Eran  *      - Redistributions of source code must retain the above
158cdd312cSHaggai Eran  *        copyright notice, this list of conditions and the following
168cdd312cSHaggai Eran  *        disclaimer.
178cdd312cSHaggai Eran  *
188cdd312cSHaggai Eran  *      - Redistributions in binary form must reproduce the above
198cdd312cSHaggai Eran  *        copyright notice, this list of conditions and the following
208cdd312cSHaggai Eran  *        disclaimer in the documentation and/or other materials
218cdd312cSHaggai Eran  *        provided with the distribution.
228cdd312cSHaggai Eran  *
238cdd312cSHaggai Eran  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
248cdd312cSHaggai Eran  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
258cdd312cSHaggai Eran  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
268cdd312cSHaggai Eran  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
278cdd312cSHaggai Eran  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
288cdd312cSHaggai Eran  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
298cdd312cSHaggai Eran  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
308cdd312cSHaggai Eran  * SOFTWARE.
318cdd312cSHaggai Eran  */
328cdd312cSHaggai Eran 
337bdf65d4SHaggai Eran #include <rdma/ib_umem_odp.h>
34e980b441SJérémy Lefaure #include <linux/kernel.h>
3590da7dc8SJianxin Xiong #include <linux/dma-buf.h>
3690da7dc8SJianxin Xiong #include <linux/dma-resv.h>
377bdf65d4SHaggai Eran 
388cdd312cSHaggai Eran #include "mlx5_ib.h"
3981713d37SArtemy Kovalyov #include "cmd.h"
40f49c856aSAharon Landau #include "umr.h"
41333fbaa0SLeon Romanovsky #include "qp.h"
428cdd312cSHaggai Eran 
43d5d284b8SSaeed Mahameed #include <linux/mlx5/eq.h>
44d5d284b8SSaeed Mahameed 
45d5d284b8SSaeed Mahameed /* Contains the details of a pagefault. */
46d5d284b8SSaeed Mahameed struct mlx5_pagefault {
47d5d284b8SSaeed Mahameed 	u32			bytes_committed;
48d5d284b8SSaeed Mahameed 	u32			token;
49d5d284b8SSaeed Mahameed 	u8			event_subtype;
50d5d284b8SSaeed Mahameed 	u8			type;
51d5d284b8SSaeed Mahameed 	union {
52d5d284b8SSaeed Mahameed 		/* Initiator or send message responder pagefault details. */
53d5d284b8SSaeed Mahameed 		struct {
54d5d284b8SSaeed Mahameed 			/* Received packet size, only valid for responders. */
55d5d284b8SSaeed Mahameed 			u32	packet_size;
56d5d284b8SSaeed Mahameed 			/*
57d5d284b8SSaeed Mahameed 			 * Number of the resource holding the WQE; depends on type.
58d5d284b8SSaeed Mahameed 			 */
59d5d284b8SSaeed Mahameed 			u32	wq_num;
60d5d284b8SSaeed Mahameed 			/*
61d5d284b8SSaeed Mahameed 			 * WQE index. Refers to either the send queue or
62d5d284b8SSaeed Mahameed 			 * receive queue, according to event_subtype.
63d5d284b8SSaeed Mahameed 			 */
64d5d284b8SSaeed Mahameed 			u16	wqe_index;
65d5d284b8SSaeed Mahameed 		} wqe;
66d5d284b8SSaeed Mahameed 		/* RDMA responder pagefault details */
67d5d284b8SSaeed Mahameed 		struct {
68d5d284b8SSaeed Mahameed 			u32	r_key;
69d5d284b8SSaeed Mahameed 			/*
70d5d284b8SSaeed Mahameed 			 * Received packet size; the minimal page fault
71d5d284b8SSaeed Mahameed 			 * resolution size required for forward progress.
72d5d284b8SSaeed Mahameed 			 */
73d5d284b8SSaeed Mahameed 			u32	packet_size;
74d5d284b8SSaeed Mahameed 			u32	rdma_op_len;
75d5d284b8SSaeed Mahameed 			u64	rdma_va;
76d5d284b8SSaeed Mahameed 		} rdma;
77d5d284b8SSaeed Mahameed 	};
78d5d284b8SSaeed Mahameed 
79d5d284b8SSaeed Mahameed 	struct mlx5_ib_pf_eq	*eq;
80d5d284b8SSaeed Mahameed 	struct work_struct	work;
81d5d284b8SSaeed Mahameed };
82d5d284b8SSaeed Mahameed 
83eab668a6SHaggai Eran #define MAX_PREFETCH_LEN (4*1024*1024U)
84eab668a6SHaggai Eran 
85b4cfe447SHaggai Eran /* Timeout in ms to wait for an active mmu notifier to complete when handling
86b4cfe447SHaggai Eran  * a pagefault. */
87b4cfe447SHaggai Eran #define MMU_NOTIFIER_TIMEOUT 1000
88b4cfe447SHaggai Eran 
8981713d37SArtemy Kovalyov #define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
9081713d37SArtemy Kovalyov #define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
9181713d37SArtemy Kovalyov #define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
9281713d37SArtemy Kovalyov #define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
9381713d37SArtemy Kovalyov #define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))
9481713d37SArtemy Kovalyov 
9581713d37SArtemy Kovalyov #define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT
9681713d37SArtemy Kovalyov 
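/*
 * Geometry of an implicit ODP MR: the parent is a KSM mkey whose entries
 * each point at one child MTT MR covering MLX5_IMR_MTT_SIZE bytes of
 * virtual address space.  MLX5_IMR_MTT_SHIFT is always 30, so every child
 * spans 1 GiB; with the common 4 KiB PAGE_SIZE a child holds
 * MLX5_IMR_MTT_ENTRIES == 2^18 page-sized MTT entries.
 * mlx5_imr_ksm_entries below is the number of such 1 GiB slots an implicit
 * MR exposes; addresses at or beyond mlx5_imr_ksm_entries *
 * MLX5_IMR_MTT_SIZE cannot be faulted in.
 */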
9781713d37SArtemy Kovalyov static u64 mlx5_imr_ksm_entries;
9881713d37SArtemy Kovalyov 
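/*
 * Build the KSM/KLM entries for an implicit MR's parent mkey.  With
 * MLX5_IB_UPD_XLT_ZAP every entry is pointed at the null mkey; otherwise
 * each entry is taken from imr->implicit_children: a present child is
 * referenced by its lkey at va = idx * MLX5_IMR_MTT_SIZE, and a missing
 * child is represented by the device's null mkey.
 */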
99cbe4b8f0SArtemy Kovalyov static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
100423f52d6SJason Gunthorpe 			struct mlx5_ib_mr *imr, int flags)
10181713d37SArtemy Kovalyov {
102423f52d6SJason Gunthorpe 	struct mlx5_klm *end = pklm + nentries;
10381713d37SArtemy Kovalyov 
10481713d37SArtemy Kovalyov 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
105423f52d6SJason Gunthorpe 		for (; pklm != end; pklm++, idx++) {
10681713d37SArtemy Kovalyov 			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
107*594cac11SOr Har-Toov 			pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
10881713d37SArtemy Kovalyov 			pklm->va = 0;
10981713d37SArtemy Kovalyov 		}
11081713d37SArtemy Kovalyov 		return;
11181713d37SArtemy Kovalyov 	}
11281713d37SArtemy Kovalyov 
113f28b1932SJason Gunthorpe 	/*
114423f52d6SJason Gunthorpe 	 * The locking here is pretty subtle. Ideally the implicit_children
115423f52d6SJason Gunthorpe 	 * xarray would be protected by the umem_mutex, however that is not
116f28b1932SJason Gunthorpe 	 * possible. Instead this uses a weaker update-then-lock pattern:
117f28b1932SJason Gunthorpe 	 *
118423f52d6SJason Gunthorpe 	 *    xa_store()
119f28b1932SJason Gunthorpe 	 *    mutex_lock(umem_mutex)
120636bdbfcSAharon Landau 	 *     mlx5r_umr_update_xlt()
121f28b1932SJason Gunthorpe 	 *    mutex_unlock(umem_mutex)
122f28b1932SJason Gunthorpe 	 *    destroy lkey
123f28b1932SJason Gunthorpe 	 *
124423f52d6SJason Gunthorpe 	 * i.e. any change to the xarray must be followed by the locked update_xlt
125423f52d6SJason Gunthorpe 	 * before destroying.
126f28b1932SJason Gunthorpe 	 *
127f28b1932SJason Gunthorpe 	 * The umem_mutex provides the acquire/release semantic needed to make
128db72438cSYishai Hadas 	 * the xa_store() visible to a racing thread.
129f28b1932SJason Gunthorpe 	 */
130423f52d6SJason Gunthorpe 	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
131f28b1932SJason Gunthorpe 
132423f52d6SJason Gunthorpe 	for (; pklm != end; pklm++, idx++) {
133423f52d6SJason Gunthorpe 		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
13481713d37SArtemy Kovalyov 
13581713d37SArtemy Kovalyov 		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
136423f52d6SJason Gunthorpe 		if (mtt) {
13781713d37SArtemy Kovalyov 			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
138423f52d6SJason Gunthorpe 			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
13981713d37SArtemy Kovalyov 		} else {
140*594cac11SOr Har-Toov 			pklm->key = mr_to_mdev(imr)->mkeys.null_mkey;
1419162420dSJason Gunthorpe 			pklm->va = 0;
14281713d37SArtemy Kovalyov 		}
14381713d37SArtemy Kovalyov 	}
14481713d37SArtemy Kovalyov }
14581713d37SArtemy Kovalyov 
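/*
 * The ODP core keeps one dma_list entry per page, with the allowed access
 * encoded as ODP_READ_ALLOWED_BIT / ODP_WRITE_ALLOWED_BIT flag bits or'd
 * into the DMA address.  The helpers below strip those flags and translate
 * them into the MLX5_IB_MTT_READ/WRITE permission bits of a hardware MTT
 * entry.
 */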
146cbe4b8f0SArtemy Kovalyov static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
147cbe4b8f0SArtemy Kovalyov {
148cbe4b8f0SArtemy Kovalyov 	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
149cbe4b8f0SArtemy Kovalyov 
150cbe4b8f0SArtemy Kovalyov 	if (umem_dma & ODP_READ_ALLOWED_BIT)
151cbe4b8f0SArtemy Kovalyov 		mtt_entry |= MLX5_IB_MTT_READ;
152cbe4b8f0SArtemy Kovalyov 	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
153cbe4b8f0SArtemy Kovalyov 		mtt_entry |= MLX5_IB_MTT_WRITE;
154cbe4b8f0SArtemy Kovalyov 
155cbe4b8f0SArtemy Kovalyov 	return mtt_entry;
156cbe4b8f0SArtemy Kovalyov }
157cbe4b8f0SArtemy Kovalyov 
158cbe4b8f0SArtemy Kovalyov static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
159cbe4b8f0SArtemy Kovalyov 			 struct mlx5_ib_mr *mr, int flags)
160cbe4b8f0SArtemy Kovalyov {
161cbe4b8f0SArtemy Kovalyov 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
162cbe4b8f0SArtemy Kovalyov 	dma_addr_t pa;
163cbe4b8f0SArtemy Kovalyov 	size_t i;
164cbe4b8f0SArtemy Kovalyov 
165cbe4b8f0SArtemy Kovalyov 	if (flags & MLX5_IB_UPD_XLT_ZAP)
166cbe4b8f0SArtemy Kovalyov 		return;
167cbe4b8f0SArtemy Kovalyov 
168cbe4b8f0SArtemy Kovalyov 	for (i = 0; i < nentries; i++) {
169cbe4b8f0SArtemy Kovalyov 		pa = odp->dma_list[idx + i];
170cbe4b8f0SArtemy Kovalyov 		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
171cbe4b8f0SArtemy Kovalyov 	}
172cbe4b8f0SArtemy Kovalyov }
173cbe4b8f0SArtemy Kovalyov 
174cbe4b8f0SArtemy Kovalyov void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
175cbe4b8f0SArtemy Kovalyov 			   struct mlx5_ib_mr *mr, int flags)
176cbe4b8f0SArtemy Kovalyov {
177cbe4b8f0SArtemy Kovalyov 	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
178cbe4b8f0SArtemy Kovalyov 		populate_klm(xlt, idx, nentries, mr, flags);
179cbe4b8f0SArtemy Kovalyov 	} else {
180cbe4b8f0SArtemy Kovalyov 		populate_mtt(xlt, idx, nentries, mr, flags);
181cbe4b8f0SArtemy Kovalyov 	}
182cbe4b8f0SArtemy Kovalyov }
183cbe4b8f0SArtemy Kovalyov 
1845256edcbSJason Gunthorpe /*
185db72438cSYishai Hadas  * This must be called after the mr has been removed from implicit_children.
186db72438cSYishai Hadas  * NOTE: The MR is not necessarily empty here; parallel page faults could
187d561987fSJason Gunthorpe  * have raced with the free process and added pages to it.
1895256edcbSJason Gunthorpe  */
1905256edcbSJason Gunthorpe static void free_implicit_child_mr_work(struct work_struct *work)
1915256edcbSJason Gunthorpe {
1925256edcbSJason Gunthorpe 	struct mlx5_ib_mr *mr =
1935256edcbSJason Gunthorpe 		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
194db72438cSYishai Hadas 	struct mlx5_ib_mr *imr = mr->parent;
195e6fb246cSJason Gunthorpe 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
196e6fb246cSJason Gunthorpe 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
1975256edcbSJason Gunthorpe 
198e6fb246cSJason Gunthorpe 	mlx5r_deref_wait_odp_mkey(&mr->mmkey);
199e6fb246cSJason Gunthorpe 
200e6fb246cSJason Gunthorpe 	mutex_lock(&odp_imr->umem_mutex);
201636bdbfcSAharon Landau 	mlx5r_umr_update_xlt(mr->parent,
202636bdbfcSAharon Landau 			     ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
203e6fb246cSJason Gunthorpe 			     MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
204e6fb246cSJason Gunthorpe 	mutex_unlock(&odp_imr->umem_mutex);
205e6fb246cSJason Gunthorpe 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
206e6fb246cSJason Gunthorpe 
207db72438cSYishai Hadas 	mlx5r_deref_odp_mkey(&imr->mmkey);
2085256edcbSJason Gunthorpe }
2095256edcbSJason Gunthorpe 
2105256edcbSJason Gunthorpe static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
2115256edcbSJason Gunthorpe {
2125256edcbSJason Gunthorpe 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
2135256edcbSJason Gunthorpe 	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
2145256edcbSJason Gunthorpe 	struct mlx5_ib_mr *imr = mr->parent;
2155256edcbSJason Gunthorpe 
216db72438cSYishai Hadas 	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
217db72438cSYishai Hadas 		return;
2185256edcbSJason Gunthorpe 
219db72438cSYishai Hadas 	xa_erase(&imr->implicit_children, idx);
2205256edcbSJason Gunthorpe 
221db72438cSYishai Hadas 	/* Freeing an MR is a sleeping operation, so bounce to a work queue */
222db72438cSYishai Hadas 	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
223db72438cSYishai Hadas 	queue_work(system_unbound_wq, &mr->odp_destroy.work);
22481713d37SArtemy Kovalyov }
22581713d37SArtemy Kovalyov 
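/*
 * mmu_interval_notifier callback: zap the device MTTs that cover the
 * invalidated range (batching runs of present entries into as few UMRs as
 * possible), then unmap and release the DMA mappings.  If this empties the
 * umem of an implicit child MR, the child is queued for destruction.
 */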
226f25a546eSJason Gunthorpe static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
227f25a546eSJason Gunthorpe 				     const struct mmu_notifier_range *range,
228f25a546eSJason Gunthorpe 				     unsigned long cur_seq)
229b4cfe447SHaggai Eran {
230f25a546eSJason Gunthorpe 	struct ib_umem_odp *umem_odp =
231f25a546eSJason Gunthorpe 		container_of(mni, struct ib_umem_odp, notifier);
232b4cfe447SHaggai Eran 	struct mlx5_ib_mr *mr;
23302648b4bSTariq Toukan 	const u64 umr_block_mask = MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1;
234b4cfe447SHaggai Eran 	u64 idx = 0, blk_start_idx = 0;
235a3de94e3SErez Alfasi 	u64 invalidations = 0;
236f25a546eSJason Gunthorpe 	unsigned long start;
237f25a546eSJason Gunthorpe 	unsigned long end;
238b4cfe447SHaggai Eran 	int in_block = 0;
239b4cfe447SHaggai Eran 	u64 addr;
240b4cfe447SHaggai Eran 
241f25a546eSJason Gunthorpe 	if (!mmu_notifier_range_blockable(range))
242f25a546eSJason Gunthorpe 		return false;
243f25a546eSJason Gunthorpe 
24409689703SJason Gunthorpe 	mutex_lock(&umem_odp->umem_mutex);
245f25a546eSJason Gunthorpe 	mmu_interval_set_seq(mni, cur_seq);
24609689703SJason Gunthorpe 	/*
24709689703SJason Gunthorpe 	 * If npages is zero then umem_odp->private may not be set up yet. This
24809689703SJason Gunthorpe 	 * does not complete until after the first page is mapped for DMA.
24909689703SJason Gunthorpe 	 */
25009689703SJason Gunthorpe 	if (!umem_odp->npages)
25109689703SJason Gunthorpe 		goto out;
252b5231b01SJason Gunthorpe 	mr = umem_odp->private;
253b4cfe447SHaggai Eran 
254f25a546eSJason Gunthorpe 	start = max_t(u64, ib_umem_start(umem_odp), range->start);
255f25a546eSJason Gunthorpe 	end = min_t(u64, ib_umem_end(umem_odp), range->end);
256b4cfe447SHaggai Eran 
257b4cfe447SHaggai Eran 	/*
258b4cfe447SHaggai Eran 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
259b4cfe447SHaggai Eran 	 * while we are doing the invalidation, no page fault will attempt to
260b4cfe447SHaggai Eran 	 * overwrite the same MTTs.  Concurrent invalidations might race us,
261b4cfe447SHaggai Eran 	 * but they will write 0s as well, so no difference in the end result.
262b4cfe447SHaggai Eran 	 */
263d2183c6fSJason Gunthorpe 	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
264d2183c6fSJason Gunthorpe 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
265b4cfe447SHaggai Eran 		/*
266b4cfe447SHaggai Eran 		 * Strive to write the MTTs in chunks, but avoid overwriting
267b4cfe447SHaggai Eran 		 * non-existing MTTs. The heuristic here can be improved to
268b4cfe447SHaggai Eran 		 * estimate the cost of another UMR vs. the cost of a bigger
269b4cfe447SHaggai Eran 		 * UMR.
270b4cfe447SHaggai Eran 		 */
271b5231b01SJason Gunthorpe 		if (umem_odp->dma_list[idx] &
272b4cfe447SHaggai Eran 		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
273b4cfe447SHaggai Eran 			if (!in_block) {
274b4cfe447SHaggai Eran 				blk_start_idx = idx;
275b4cfe447SHaggai Eran 				in_block = 1;
276b4cfe447SHaggai Eran 			}
277a3de94e3SErez Alfasi 
278a3de94e3SErez Alfasi 			/* Count page invalidations */
279a3de94e3SErez Alfasi 			invalidations += idx - blk_start_idx + 1;
280b4cfe447SHaggai Eran 		} else {
281b4cfe447SHaggai Eran 			u64 umr_offset = idx & umr_block_mask;
282b4cfe447SHaggai Eran 
283b4cfe447SHaggai Eran 			if (in_block && umr_offset == 0) {
284636bdbfcSAharon Landau 				mlx5r_umr_update_xlt(mr, blk_start_idx,
285b2ac9188SArtemy Kovalyov 						     idx - blk_start_idx, 0,
2867d0cc6edSArtemy Kovalyov 						     MLX5_IB_UPD_XLT_ZAP |
2877d0cc6edSArtemy Kovalyov 						     MLX5_IB_UPD_XLT_ATOMIC);
288b4cfe447SHaggai Eran 				in_block = 0;
289b4cfe447SHaggai Eran 			}
290b4cfe447SHaggai Eran 		}
291b4cfe447SHaggai Eran 	}
292b4cfe447SHaggai Eran 	if (in_block)
293636bdbfcSAharon Landau 		mlx5r_umr_update_xlt(mr, blk_start_idx,
294b2ac9188SArtemy Kovalyov 				     idx - blk_start_idx + 1, 0,
2957d0cc6edSArtemy Kovalyov 				     MLX5_IB_UPD_XLT_ZAP |
2967d0cc6edSArtemy Kovalyov 				     MLX5_IB_UPD_XLT_ATOMIC);
297a3de94e3SErez Alfasi 
298a3de94e3SErez Alfasi 	mlx5_update_odp_stats(mr, invalidations, invalidations);
299a3de94e3SErez Alfasi 
300b4cfe447SHaggai Eran 	/*
301b4cfe447SHaggai Eran 	 * We are now sure that the device will not access the
302b4cfe447SHaggai Eran 	 * memory. We can safely unmap it, and mark it as dirty if
303b4cfe447SHaggai Eran 	 * needed.
304b4cfe447SHaggai Eran 	 */
305b4cfe447SHaggai Eran 
306b5231b01SJason Gunthorpe 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
30781713d37SArtemy Kovalyov 
3085256edcbSJason Gunthorpe 	if (unlikely(!umem_odp->npages && mr->parent))
3095256edcbSJason Gunthorpe 		destroy_unused_implicit_child_mr(mr);
31009689703SJason Gunthorpe out:
3119dc775e7SJason Gunthorpe 	mutex_unlock(&umem_odp->umem_mutex);
312f25a546eSJason Gunthorpe 	return true;
313b4cfe447SHaggai Eran }
314b4cfe447SHaggai Eran 
315f25a546eSJason Gunthorpe const struct mmu_interval_notifier_ops mlx5_mn_ops = {
316f25a546eSJason Gunthorpe 	.invalidate = mlx5_ib_invalidate_range,
317f25a546eSJason Gunthorpe };
318f25a546eSJason Gunthorpe 
319e5dc370bSShay Drory static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
3208cdd312cSHaggai Eran {
3218cdd312cSHaggai Eran 	struct ib_odp_caps *caps = &dev->odp_caps;
3228cdd312cSHaggai Eran 
3238cdd312cSHaggai Eran 	memset(caps, 0, sizeof(*caps));
3248cdd312cSHaggai Eran 
325f49c856aSAharon Landau 	if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0))
326938fe83cSSaeed Mahameed 		return;
3278cdd312cSHaggai Eran 
328b4cfe447SHaggai Eran 	caps->general_caps = IB_ODP_SUPPORT;
329b4cfe447SHaggai Eran 
330c438fde1SArtemy Kovalyov 	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
331c438fde1SArtemy Kovalyov 		dev->odp_max_size = U64_MAX;
332c438fde1SArtemy Kovalyov 	else
333c438fde1SArtemy Kovalyov 		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);
334c438fde1SArtemy Kovalyov 
335938fe83cSSaeed Mahameed 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
336938fe83cSSaeed Mahameed 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
337938fe83cSSaeed Mahameed 
3382e68daceSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
3392e68daceSMoni Shoua 		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
3402e68daceSMoni Shoua 
341938fe83cSSaeed Mahameed 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
342938fe83cSSaeed Mahameed 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
343938fe83cSSaeed Mahameed 
344938fe83cSSaeed Mahameed 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
345938fe83cSSaeed Mahameed 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
346938fe83cSSaeed Mahameed 
347938fe83cSSaeed Mahameed 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
348938fe83cSSaeed Mahameed 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
349938fe83cSSaeed Mahameed 
350938fe83cSSaeed Mahameed 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
351938fe83cSSaeed Mahameed 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
352938fe83cSSaeed Mahameed 
35317d2f88fSArtemy Kovalyov 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
35417d2f88fSArtemy Kovalyov 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
35517d2f88fSArtemy Kovalyov 
3562e68daceSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
3572e68daceSMoni Shoua 		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
3582e68daceSMoni Shoua 
3596141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
3606141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;
3616141f8faSMoni Shoua 
3626141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
3636141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;
3646141f8faSMoni Shoua 
3656141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
3666141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;
3676141f8faSMoni Shoua 
3686141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
3696141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;
3706141f8faSMoni Shoua 
3716141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
3726141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
3736141f8faSMoni Shoua 
3746141f8faSMoni Shoua 	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
3756141f8faSMoni Shoua 		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
3766141f8faSMoni Shoua 
37781713d37SArtemy Kovalyov 	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
37881713d37SArtemy Kovalyov 	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
37900815752SMoni Shoua 	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
38000815752SMoni Shoua 	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
38181713d37SArtemy Kovalyov 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
3828cdd312cSHaggai Eran }
3836aec21f6SHaggai Eran 
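/*
 * Acknowledge a page fault to firmware with PAGE_FAULT_RESUME so the device
 * resumes the stalled WQ / RDMA operation; when @error is set the fault is
 * reported back as unresolved.
 */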
384d9aaed83SArtemy Kovalyov static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
385d9aaed83SArtemy Kovalyov 				      struct mlx5_pagefault *pfault,
38619098df2Smajd@mellanox.com 				      int error)
38719098df2Smajd@mellanox.com {
388d9aaed83SArtemy Kovalyov 	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
389d9aaed83SArtemy Kovalyov 		     pfault->wqe.wq_num : pfault->token;
390d5d284b8SSaeed Mahameed 	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
391d5d284b8SSaeed Mahameed 	int err;
392d5d284b8SSaeed Mahameed 
393d5d284b8SSaeed Mahameed 	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
394d5d284b8SSaeed Mahameed 	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
395d5d284b8SSaeed Mahameed 	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
396d5d284b8SSaeed Mahameed 	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
397d5d284b8SSaeed Mahameed 	MLX5_SET(page_fault_resume_in, in, error, !!error);
398d5d284b8SSaeed Mahameed 
39931578defSLeon Romanovsky 	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
400d5d284b8SSaeed Mahameed 	if (err)
401d5d284b8SSaeed Mahameed 		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
402d5d284b8SSaeed Mahameed 			    wq_num, err);
4036aec21f6SHaggai Eran }
4046aec21f6SHaggai Eran 
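/*
 * Create the child MTT MR backing 1 GiB slot @idx of an implicit MR: a
 * child ODP umem and a cache mkey are set up, the translation is written
 * zapped-but-enabled, and the child is published in imr->implicit_children
 * with two references (one owned by the xarray, one returned to the
 * caller).  If another thread published a child for the same slot first,
 * that child is returned instead and the local one is destroyed.
 */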
4053d5f3c54SJason Gunthorpe static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
4063d5f3c54SJason Gunthorpe 						unsigned long idx)
40781713d37SArtemy Kovalyov {
40856561ac6SAharon Landau 	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
4093d5f3c54SJason Gunthorpe 	struct ib_umem_odp *odp;
41081713d37SArtemy Kovalyov 	struct mlx5_ib_mr *mr;
411c2edcd69SJason Gunthorpe 	struct mlx5_ib_mr *ret;
41281713d37SArtemy Kovalyov 	int err;
41381713d37SArtemy Kovalyov 
4143d5f3c54SJason Gunthorpe 	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
4153d5f3c54SJason Gunthorpe 				      idx * MLX5_IMR_MTT_SIZE,
416f25a546eSJason Gunthorpe 				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
4173d5f3c54SJason Gunthorpe 	if (IS_ERR(odp))
418b5231b01SJason Gunthorpe 		return ERR_CAST(odp);
41981713d37SArtemy Kovalyov 
42056561ac6SAharon Landau 	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
42156561ac6SAharon Landau 				 MLX5_MKC_ACCESS_MODE_MTT,
422e6fb246cSJason Gunthorpe 				 MLX5_IMR_MTT_ENTRIES);
423e6fb246cSJason Gunthorpe 	if (IS_ERR(mr)) {
424e6fb246cSJason Gunthorpe 		ib_umem_odp_release(odp);
425e6fb246cSJason Gunthorpe 		return mr;
42681713d37SArtemy Kovalyov 	}
42756561ac6SAharon Landau 
428c2edcd69SJason Gunthorpe 	mr->access_flags = imr->access_flags;
429ca991a7dSMaor Gottlieb 	mr->ibmr.pd = imr->ibmr.pd;
430c2edcd69SJason Gunthorpe 	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
431c2edcd69SJason Gunthorpe 	mr->umem = &odp->umem;
432c2edcd69SJason Gunthorpe 	mr->ibmr.lkey = mr->mmkey.key;
433cf6a8b1bSAharon Landau 	mr->ibmr.rkey = mr->mmkey.key;
434c2edcd69SJason Gunthorpe 	mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
435c2edcd69SJason Gunthorpe 	mr->parent = imr;
43681713d37SArtemy Kovalyov 	odp->private = mr;
437db72438cSYishai Hadas 
438db72438cSYishai Hadas 	/*
439db72438cSYishai Hadas 	 * The first refcount is owned by the xarray and the second refcount
440db72438cSYishai Hadas 	 * is returned to the caller.
441db72438cSYishai Hadas 	 */
442db72438cSYishai Hadas 	refcount_set(&mr->mmkey.usecount, 2);
443636bdbfcSAharon Landau 
44481713d37SArtemy Kovalyov 	err = mlx5r_umr_update_xlt(mr, 0,
44581713d37SArtemy Kovalyov 				   MLX5_IMR_MTT_ENTRIES,
44681713d37SArtemy Kovalyov 				   PAGE_SHIFT,
4473389baa8SJason Gunthorpe 				   MLX5_IB_UPD_XLT_ZAP |
448c2edcd69SJason Gunthorpe 				   MLX5_IB_UPD_XLT_ENABLE);
449c2edcd69SJason Gunthorpe 	if (err) {
450d561987fSJason Gunthorpe 		ret = ERR_PTR(err);
45181713d37SArtemy Kovalyov 		goto out_mr;
45281713d37SArtemy Kovalyov 	}
453db72438cSYishai Hadas 
454db72438cSYishai Hadas 	xa_lock(&imr->implicit_children);
4555256edcbSJason Gunthorpe 	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
4563389baa8SJason Gunthorpe 			   GFP_KERNEL);
4573389baa8SJason Gunthorpe 	if (unlikely(ret)) {
4583389baa8SJason Gunthorpe 		if (xa_is_err(ret)) {
459db72438cSYishai Hadas 			ret = ERR_PTR(xa_err(ret));
4603389baa8SJason Gunthorpe 			goto out_lock;
4613389baa8SJason Gunthorpe 		}
4623389baa8SJason Gunthorpe 		/*
4633389baa8SJason Gunthorpe 		 * Another thread beat us to creating the child mr, use
4643389baa8SJason Gunthorpe 		 * theirs.
465db72438cSYishai Hadas 		 */
466db72438cSYishai Hadas 		refcount_inc(&ret->mmkey.usecount);
467423f52d6SJason Gunthorpe 		goto out_lock;
468db72438cSYishai Hadas 	}
46981713d37SArtemy Kovalyov 	xa_unlock(&imr->implicit_children);
470ca991a7dSMaor Gottlieb 
47181713d37SArtemy Kovalyov 	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
47281713d37SArtemy Kovalyov 	return mr;
473db72438cSYishai Hadas 
474db72438cSYishai Hadas out_lock:
475c2edcd69SJason Gunthorpe 	xa_unlock(&imr->implicit_children);
476e6fb246cSJason Gunthorpe out_mr:
477c2edcd69SJason Gunthorpe 	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
47881713d37SArtemy Kovalyov 	return ret;
47981713d37SArtemy Kovalyov }
48081713d37SArtemy Kovalyov 
48181713d37SArtemy Kovalyov struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
48281713d37SArtemy Kovalyov 					     int access_flags)
483c2edcd69SJason Gunthorpe {
484f20bef6aSJason Gunthorpe 	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
485c2edcd69SJason Gunthorpe 	struct ib_umem_odp *umem_odp;
486c2edcd69SJason Gunthorpe 	struct mlx5_ib_mr *imr;
48781713d37SArtemy Kovalyov 	int err;
488f49c856aSAharon Landau 
48938f8ff5bSJason Gunthorpe 	if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
49038f8ff5bSJason Gunthorpe 		return ERR_PTR(-EOPNOTSUPP);
491c320e527SMoni Shoua 
492f20bef6aSJason Gunthorpe 	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
493f20bef6aSJason Gunthorpe 	if (IS_ERR(umem_odp))
49481713d37SArtemy Kovalyov 		return ERR_CAST(umem_odp);
49556561ac6SAharon Landau 
49656561ac6SAharon Landau 	imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
49756561ac6SAharon Landau 				  mlx5_imr_ksm_entries);
49881713d37SArtemy Kovalyov 	if (IS_ERR(imr)) {
499e6fb246cSJason Gunthorpe 		ib_umem_odp_release(umem_odp);
500e6fb246cSJason Gunthorpe 		return imr;
50181713d37SArtemy Kovalyov 	}
50281713d37SArtemy Kovalyov 
50356561ac6SAharon Landau 	imr->access_flags = access_flags;
504c2edcd69SJason Gunthorpe 	imr->ibmr.pd = &pd->ibpd;
505cf6a8b1bSAharon Landau 	imr->ibmr.iova = 0;
506f20bef6aSJason Gunthorpe 	imr->umem = &umem_odp->umem;
507c2edcd69SJason Gunthorpe 	imr->ibmr.lkey = imr->mmkey.key;
508c2edcd69SJason Gunthorpe 	imr->ibmr.rkey = imr->mmkey.key;
509ca991a7dSMaor Gottlieb 	imr->ibmr.device = &dev->ib_dev;
510e1b95ae0SErez Alfasi 	imr->is_odp_implicit = true;
511423f52d6SJason Gunthorpe 	xa_init(&imr->implicit_children);
512e1b95ae0SErez Alfasi 
513636bdbfcSAharon Landau 	err = mlx5r_umr_update_xlt(imr, 0,
514c2edcd69SJason Gunthorpe 				   mlx5_imr_ksm_entries,
515c2edcd69SJason Gunthorpe 				   MLX5_KSM_PAGE_SHIFT,
516c2edcd69SJason Gunthorpe 				   MLX5_IB_UPD_XLT_INDIRECT |
517c2edcd69SJason Gunthorpe 				   MLX5_IB_UPD_XLT_ZAP |
518c2edcd69SJason Gunthorpe 				   MLX5_IB_UPD_XLT_ENABLE);
519c2edcd69SJason Gunthorpe 	if (err)
520c2edcd69SJason Gunthorpe 		goto out_mr;
521c2edcd69SJason Gunthorpe 
522db72438cSYishai Hadas 	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
523c2edcd69SJason Gunthorpe 	if (err)
524c2edcd69SJason Gunthorpe 		goto out_mr;
525c2edcd69SJason Gunthorpe 
526c2edcd69SJason Gunthorpe 	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
52781713d37SArtemy Kovalyov 	return imr;
528c2edcd69SJason Gunthorpe out_mr:
529c2edcd69SJason Gunthorpe 	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
530e6fb246cSJason Gunthorpe 	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
531c2edcd69SJason Gunthorpe 	return ERR_PTR(err);
53281713d37SArtemy Kovalyov }
53381713d37SArtemy Kovalyov 
534e6fb246cSJason Gunthorpe void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
53581713d37SArtemy Kovalyov {
536423f52d6SJason Gunthorpe 	struct mlx5_ib_mr *mtt;
537423f52d6SJason Gunthorpe 	unsigned long idx;
538f993de88SJason Gunthorpe 
5395256edcbSJason Gunthorpe 	/*
540e6fb246cSJason Gunthorpe 	 * If this is an implicit MR it is already invalidated so we can just
541e6fb246cSJason Gunthorpe 	 * delete the child mkeys.
542a862192eSJason Gunthorpe 	 */
543e6fb246cSJason Gunthorpe 	xa_for_each(&mr->implicit_children, idx, mtt) {
544e6fb246cSJason Gunthorpe 		xa_erase(&mr->implicit_children, idx);
545e6fb246cSJason Gunthorpe 		mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
54690da7dc8SJianxin Xiong 	}
54790da7dc8SJianxin Xiong }
54890da7dc8SJianxin Xiong 
549813e90b1SMoni Shoua #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
550677cf51fSYishai Hadas #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
551a03bfc37SYishai Hadas #define MLX5_PF_FLAGS_ENABLE BIT(3)
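/*
 * Page fault handling flags:
 *  MLX5_PF_FLAGS_DOWNGRADE - map the pages read-only even if the umem is
 *                            writable.
 *  MLX5_PF_FLAGS_SNAPSHOT  - only mirror pages that are already present;
 *                            do not fault new pages in.
 *  MLX5_PF_FLAGS_ENABLE    - this is the initial population, so the UMR
 *                            also enables the mkey (MLX5_IB_UPD_XLT_ENABLE).
 */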
55254375e73SJason Gunthorpe static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
55354375e73SJason Gunthorpe 			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
554813e90b1SMoni Shoua 			     u32 flags)
5551b7dbc26SArtemy Kovalyov {
556f25a546eSJason Gunthorpe 	int page_shift, ret, np;
557813e90b1SMoni Shoua 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
5581abe186eSMoni Shoua 	u64 access_mask;
5598ffc3248SJason Gunthorpe 	u64 start_idx;
560677cf51fSYishai Hadas 	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
561a03bfc37SYishai Hadas 	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
562a03bfc37SYishai Hadas 
563a03bfc37SYishai Hadas 	if (flags & MLX5_PF_FLAGS_ENABLE)
564a03bfc37SYishai Hadas 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
5651b7dbc26SArtemy Kovalyov 
566d2183c6fSJason Gunthorpe 	page_shift = odp->page_shift;
5678ffc3248SJason Gunthorpe 	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
5681abe186eSMoni Shoua 	access_mask = ODP_READ_ALLOWED_BIT;
5691b7dbc26SArtemy Kovalyov 
570fba0e448SJason Gunthorpe 	if (odp->umem.writable && !downgrade)
5711b7dbc26SArtemy Kovalyov 		access_mask |= ODP_WRITE_ALLOWED_BIT;
5721b7dbc26SArtemy Kovalyov 
573677cf51fSYishai Hadas 	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
57454375e73SJason Gunthorpe 	if (np < 0)
57554375e73SJason Gunthorpe 		return np;
5761b7dbc26SArtemy Kovalyov 
5771b7dbc26SArtemy Kovalyov 	/*
57836f30e48SYishai Hadas 	 * No need to check whether the MTTs really belong to this MR, since
57936f30e48SYishai Hadas 	 * ib_umem_odp_map_dma_and_lock already checks this.
5801b7dbc26SArtemy Kovalyov 	 */
581636bdbfcSAharon Landau 	ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
5821b7dbc26SArtemy Kovalyov 	mutex_unlock(&odp->umem_mutex);
5831b7dbc26SArtemy Kovalyov 
5841b7dbc26SArtemy Kovalyov 	if (ret < 0) {
5851b7dbc26SArtemy Kovalyov 		if (ret != -EAGAIN)
586ca991a7dSMaor Gottlieb 			mlx5_ib_err(mr_to_mdev(mr),
587fb985e27SJason Gunthorpe 				    "Failed to update mkey page tables\n");
5881b7dbc26SArtemy Kovalyov 		goto out;
5891b7dbc26SArtemy Kovalyov 	}
5901b7dbc26SArtemy Kovalyov 
5911b7dbc26SArtemy Kovalyov 	if (bytes_mapped) {
5921b7dbc26SArtemy Kovalyov 		u32 new_mappings = (np << page_shift) -
59354375e73SJason Gunthorpe 			(user_va - round_down(user_va, 1 << page_shift));
59454375e73SJason Gunthorpe 
59554375e73SJason Gunthorpe 		*bytes_mapped += min_t(u32, new_mappings, bcnt);
5961b7dbc26SArtemy Kovalyov 	}
5971b7dbc26SArtemy Kovalyov 
59854375e73SJason Gunthorpe 	return np << (page_shift - PAGE_SHIFT);
5991b7dbc26SArtemy Kovalyov 
6001b7dbc26SArtemy Kovalyov out:
6011b7dbc26SArtemy Kovalyov 	return ret;
6021b7dbc26SArtemy Kovalyov }
6031b7dbc26SArtemy Kovalyov 
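/*
 * Resolve a fault on an implicit MR: walk every 1 GiB child that
 * intersects [user_va, user_va + bcnt), creating missing children on
 * demand, and fault each one with pagefault_real_mr().  If any child was
 * created, the affected KSM entries of the parent are pushed to the device
 * before returning.
 */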
604b70d785dSJason Gunthorpe static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
605b70d785dSJason Gunthorpe 				 struct ib_umem_odp *odp_imr, u64 user_va,
606b70d785dSJason Gunthorpe 				 size_t bcnt, u32 *bytes_mapped, u32 flags)
607b70d785dSJason Gunthorpe {
608b70d785dSJason Gunthorpe 	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
609b70d785dSJason Gunthorpe 	unsigned long upd_start_idx = end_idx + 1;
610b70d785dSJason Gunthorpe 	unsigned long upd_len = 0;
611b70d785dSJason Gunthorpe 	unsigned long npages = 0;
612b70d785dSJason Gunthorpe 	int err;
613b70d785dSJason Gunthorpe 	int ret;
614b70d785dSJason Gunthorpe 
615b70d785dSJason Gunthorpe 	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
616b70d785dSJason Gunthorpe 		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
617b70d785dSJason Gunthorpe 		return -EFAULT;
618b70d785dSJason Gunthorpe 
619b70d785dSJason Gunthorpe 	/* Fault each child mr that intersects with our interval. */
620b70d785dSJason Gunthorpe 	while (bcnt) {
621b70d785dSJason Gunthorpe 		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
622b70d785dSJason Gunthorpe 		struct ib_umem_odp *umem_odp;
623b70d785dSJason Gunthorpe 		struct mlx5_ib_mr *mtt;
624b70d785dSJason Gunthorpe 		u64 len;
625b70d785dSJason Gunthorpe 
626db72438cSYishai Hadas 		xa_lock(&imr->implicit_children);
627b70d785dSJason Gunthorpe 		mtt = xa_load(&imr->implicit_children, idx);
628b70d785dSJason Gunthorpe 		if (unlikely(!mtt)) {
629db72438cSYishai Hadas 			xa_unlock(&imr->implicit_children);
630b70d785dSJason Gunthorpe 			mtt = implicit_get_child_mr(imr, idx);
631b70d785dSJason Gunthorpe 			if (IS_ERR(mtt)) {
632b70d785dSJason Gunthorpe 				ret = PTR_ERR(mtt);
633b70d785dSJason Gunthorpe 				goto out;
634b70d785dSJason Gunthorpe 			}
635b70d785dSJason Gunthorpe 			upd_start_idx = min(upd_start_idx, idx);
636b70d785dSJason Gunthorpe 			upd_len = idx - upd_start_idx + 1;
637db72438cSYishai Hadas 		} else {
638db72438cSYishai Hadas 			refcount_inc(&mtt->mmkey.usecount);
639db72438cSYishai Hadas 			xa_unlock(&imr->implicit_children);
640b70d785dSJason Gunthorpe 		}
641b70d785dSJason Gunthorpe 
642b70d785dSJason Gunthorpe 		umem_odp = to_ib_umem_odp(mtt->umem);
643b70d785dSJason Gunthorpe 		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
644b70d785dSJason Gunthorpe 		      user_va;
645b70d785dSJason Gunthorpe 
646b70d785dSJason Gunthorpe 		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
647b70d785dSJason Gunthorpe 					bytes_mapped, flags);
648db72438cSYishai Hadas 
649db72438cSYishai Hadas 		mlx5r_deref_odp_mkey(&mtt->mmkey);
650db72438cSYishai Hadas 
651b70d785dSJason Gunthorpe 		if (ret < 0)
652b70d785dSJason Gunthorpe 			goto out;
653b70d785dSJason Gunthorpe 		user_va += len;
654b70d785dSJason Gunthorpe 		bcnt -= len;
655b70d785dSJason Gunthorpe 		npages += ret;
656b70d785dSJason Gunthorpe 	}
657b70d785dSJason Gunthorpe 
658b70d785dSJason Gunthorpe 	ret = npages;
659b70d785dSJason Gunthorpe 
660b70d785dSJason Gunthorpe 	/*
661b70d785dSJason Gunthorpe 	 * Any time the implicit_children are changed we must perform an
662b70d785dSJason Gunthorpe 	 * update of the xlt before exiting to ensure the HW and the
663b70d785dSJason Gunthorpe 	 * implicit_children remain synchronized.
664b70d785dSJason Gunthorpe 	 */
665b70d785dSJason Gunthorpe out:
666b70d785dSJason Gunthorpe 	if (likely(!upd_len))
667b70d785dSJason Gunthorpe 		return ret;
668b70d785dSJason Gunthorpe 
669b70d785dSJason Gunthorpe 	/*
670b70d785dSJason Gunthorpe 	 * Note that this is not strictly ordered: the KSM is updated after
671b70d785dSJason Gunthorpe 	 * the implicit_children xarray, so a parallel page fault could see
672b70d785dSJason Gunthorpe 	 * an MR that is not yet visible in the KSM.  This is similar to a
673b70d785dSJason Gunthorpe 	 * parallel page fault seeing an MR that is being concurrently removed
674b70d785dSJason Gunthorpe 	 * from the KSM. Both of these improbable situations are resolved
675b70d785dSJason Gunthorpe 	 * safely by resuming the HW and then taking another page fault. The
676b70d785dSJason Gunthorpe 	 * next pagefault handler will see the new information.
677b70d785dSJason Gunthorpe 	 */
678b70d785dSJason Gunthorpe 	mutex_lock(&odp_imr->umem_mutex);
679636bdbfcSAharon Landau 	err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0,
680b70d785dSJason Gunthorpe 				   MLX5_IB_UPD_XLT_INDIRECT |
681b70d785dSJason Gunthorpe 					  MLX5_IB_UPD_XLT_ATOMIC);
682b70d785dSJason Gunthorpe 	mutex_unlock(&odp_imr->umem_mutex);
683b70d785dSJason Gunthorpe 	if (err) {
684ca991a7dSMaor Gottlieb 		mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
685b70d785dSJason Gunthorpe 		return err;
686b70d785dSJason Gunthorpe 	}
687b70d785dSJason Gunthorpe 	return ret;
688b70d785dSJason Gunthorpe }
689b70d785dSJason Gunthorpe 
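/*
 * DMA-BUF MRs carry no per-page ODP state: under the dma-resv lock the
 * whole attachment is (re)mapped, the mapping is validated to still
 * provide a supported mkey page size, and the complete set of PAS entries
 * is rewritten with mlx5r_umr_update_mr_pas().
 */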
69090da7dc8SJianxin Xiong static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
69190da7dc8SJianxin Xiong 			       u32 *bytes_mapped, u32 flags)
69290da7dc8SJianxin Xiong {
69390da7dc8SJianxin Xiong 	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
69490da7dc8SJianxin Xiong 	u32 xlt_flags = 0;
69590da7dc8SJianxin Xiong 	int err;
69690da7dc8SJianxin Xiong 	unsigned int page_size;
69790da7dc8SJianxin Xiong 
69890da7dc8SJianxin Xiong 	if (flags & MLX5_PF_FLAGS_ENABLE)
69990da7dc8SJianxin Xiong 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
70090da7dc8SJianxin Xiong 
70190da7dc8SJianxin Xiong 	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
70290da7dc8SJianxin Xiong 	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
70390da7dc8SJianxin Xiong 	if (err) {
70490da7dc8SJianxin Xiong 		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
70590da7dc8SJianxin Xiong 		return err;
70690da7dc8SJianxin Xiong 	}
70790da7dc8SJianxin Xiong 
70890da7dc8SJianxin Xiong 	page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
70990da7dc8SJianxin Xiong 					     log_page_size, 0,
71090da7dc8SJianxin Xiong 					     umem_dmabuf->umem.iova);
71190da7dc8SJianxin Xiong 	if (unlikely(page_size < PAGE_SIZE)) {
71290da7dc8SJianxin Xiong 		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
71390da7dc8SJianxin Xiong 		err = -EINVAL;
71490da7dc8SJianxin Xiong 	} else {
715b3d47ebdSAharon Landau 		err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
71690da7dc8SJianxin Xiong 	}
71790da7dc8SJianxin Xiong 	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
71890da7dc8SJianxin Xiong 
71990da7dc8SJianxin Xiong 	if (err)
72090da7dc8SJianxin Xiong 		return err;
72190da7dc8SJianxin Xiong 
72290da7dc8SJianxin Xiong 	if (bytes_mapped)
72390da7dc8SJianxin Xiong 		*bytes_mapped += bcnt;
72490da7dc8SJianxin Xiong 
72590da7dc8SJianxin Xiong 	return ib_umem_num_pages(mr->umem);
72690da7dc8SJianxin Xiong }
72790da7dc8SJianxin Xiong 
72854375e73SJason Gunthorpe /*
72954375e73SJason Gunthorpe  * Returns:
73054375e73SJason Gunthorpe  *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
73154375e73SJason Gunthorpe  *           not accessible, or the MR is no longer valid.
73254375e73SJason Gunthorpe  *  -EAGAIN/-ENOMEM: The operation should be retried
73354375e73SJason Gunthorpe  *
73454375e73SJason Gunthorpe  *  -EINVAL/others: General internal malfunction
73554375e73SJason Gunthorpe  *  >0: Number of pages mapped
73654375e73SJason Gunthorpe  */
73754375e73SJason Gunthorpe static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
73854375e73SJason Gunthorpe 			u32 *bytes_mapped, u32 flags)
73954375e73SJason Gunthorpe {
74054375e73SJason Gunthorpe 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
74154375e73SJason Gunthorpe 
742cf6a8b1bSAharon Landau 	if (unlikely(io_virt < mr->ibmr.iova))
74354375e73SJason Gunthorpe 		return -EFAULT;
7448ffc3248SJason Gunthorpe 
74590da7dc8SJianxin Xiong 	if (mr->umem->is_dmabuf)
74690da7dc8SJianxin Xiong 		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);
74790da7dc8SJianxin Xiong 
7488ffc3248SJason Gunthorpe 	if (!odp->is_implicit_odp) {
7498ffc3248SJason Gunthorpe 		u64 user_va;
7508ffc3248SJason Gunthorpe 
751cf6a8b1bSAharon Landau 		if (check_add_overflow(io_virt - mr->ibmr.iova,
7528ffc3248SJason Gunthorpe 				       (u64)odp->umem.address, &user_va))
7538ffc3248SJason Gunthorpe 			return -EFAULT;
7548ffc3248SJason Gunthorpe 		if (unlikely(user_va >= ib_umem_end(odp) ||
7558ffc3248SJason Gunthorpe 			     ib_umem_end(odp) - user_va < bcnt))
7568ffc3248SJason Gunthorpe 			return -EFAULT;
7578ffc3248SJason Gunthorpe 		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
75854375e73SJason Gunthorpe 					 flags);
75954375e73SJason Gunthorpe 	}
760b70d785dSJason Gunthorpe 	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
761b70d785dSJason Gunthorpe 				     flags);
76254375e73SJason Gunthorpe }
76354375e73SJason Gunthorpe 
76438f8ff5bSJason Gunthorpe int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
765a03bfc37SYishai Hadas {
766a03bfc37SYishai Hadas 	int ret;
767a03bfc37SYishai Hadas 
76838f8ff5bSJason Gunthorpe 	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
76938f8ff5bSJason Gunthorpe 				mr->umem->length, NULL,
77038f8ff5bSJason Gunthorpe 				MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
771a03bfc37SYishai Hadas 	return ret >= 0 ? 0 : ret;
772a03bfc37SYishai Hadas }
773a03bfc37SYishai Hadas 
77490da7dc8SJianxin Xiong int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
77590da7dc8SJianxin Xiong {
77690da7dc8SJianxin Xiong 	int ret;
77790da7dc8SJianxin Xiong 
77890da7dc8SJianxin Xiong 	ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
77990da7dc8SJianxin Xiong 				  MLX5_PF_FLAGS_ENABLE);
78090da7dc8SJianxin Xiong 
78190da7dc8SJianxin Xiong 	return ret >= 0 ? 0 : ret;
78290da7dc8SJianxin Xiong }
78390da7dc8SJianxin Xiong 
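/*
 * Indirect mkeys (memory windows and DEVX indirect mkeys) are resolved
 * iteratively rather than recursively: pagefault_single_data_segment()
 * pushes one pf_frame per KLM entry that still needs resolution and pops
 * frames until the stack is empty or max_indirection is exceeded.
 */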
784db570d7dSArtemy Kovalyov struct pf_frame {
785db570d7dSArtemy Kovalyov 	struct pf_frame *next;
786db570d7dSArtemy Kovalyov 	u32 key;
787db570d7dSArtemy Kovalyov 	u64 io_virt;
788db570d7dSArtemy Kovalyov 	size_t bcnt;
789db570d7dSArtemy Kovalyov 	int depth;
790db570d7dSArtemy Kovalyov };
791db570d7dSArtemy Kovalyov 
7924123bfb0SAharon Landau static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
793d623dfd2SArtemy Kovalyov {
794d623dfd2SArtemy Kovalyov 	if (!mmkey)
795d623dfd2SArtemy Kovalyov 		return false;
79613ad1125SAharon Landau 	if (mmkey->type == MLX5_MKEY_MW ||
79713ad1125SAharon Landau 	    mmkey->type == MLX5_MKEY_INDIRECT_DEVX)
798d623dfd2SArtemy Kovalyov 		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
799d623dfd2SArtemy Kovalyov 	return mmkey->key == key;
800d623dfd2SArtemy Kovalyov }
801d623dfd2SArtemy Kovalyov 
8027bdf65d4SHaggai Eran /*
803d9aaed83SArtemy Kovalyov  * Handle a single data segment in a page-fault WQE or RDMA region.
8047bdf65d4SHaggai Eran  *
805b2ac9188SArtemy Kovalyov  * Returns number of OS pages retrieved on success. The caller may continue to
8067bdf65d4SHaggai Eran  * the next data segment.
8077bdf65d4SHaggai Eran  * Can return the following error codes:
8087bdf65d4SHaggai Eran  * -EAGAIN to designate a temporary error. The caller will abort handling the
8097bdf65d4SHaggai Eran  *  page fault and resolve it.
8107bdf65d4SHaggai Eran  * -EFAULT when there's an error mapping the requested pages. The caller will
811d9aaed83SArtemy Kovalyov  *  abort the page fault handling.
8127bdf65d4SHaggai Eran  */
81381dd4c4bSMoni Shoua static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
81481dd4c4bSMoni Shoua 					 struct ib_pd *pd, u32 key,
815813e90b1SMoni Shoua 					 u64 io_virt, size_t bcnt,
816d9aaed83SArtemy Kovalyov 					 u32 *bytes_committed,
817fb985e27SJason Gunthorpe 					 u32 *bytes_mapped)
8187bdf65d4SHaggai Eran {
819db72438cSYishai Hadas 	int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
820db570d7dSArtemy Kovalyov 	struct pf_frame *head = NULL, *frame;
8214123bfb0SAharon Landau 	struct mlx5_ib_mkey *mmkey;
8227bdf65d4SHaggai Eran 	struct mlx5_ib_mr *mr;
823db570d7dSArtemy Kovalyov 	struct mlx5_klm *pklm;
824db570d7dSArtemy Kovalyov 	u32 *out = NULL;
825db570d7dSArtemy Kovalyov 	size_t offset;
8267bdf65d4SHaggai Eran 
827d9aaed83SArtemy Kovalyov 	io_virt += *bytes_committed;
828d9aaed83SArtemy Kovalyov 	bcnt -= *bytes_committed;
8297bdf65d4SHaggai Eran 
830db570d7dSArtemy Kovalyov next_mr:
831db72438cSYishai Hadas 	xa_lock(&dev->odp_mkeys);
832806b101bSJason Gunthorpe 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
833806b101bSJason Gunthorpe 	if (!mmkey) {
834db72438cSYishai Hadas 		xa_unlock(&dev->odp_mkeys);
835806b101bSJason Gunthorpe 		mlx5_ib_dbg(
836806b101bSJason Gunthorpe 			dev,
837806b101bSJason Gunthorpe 			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
838806b101bSJason Gunthorpe 			key);
839806b101bSJason Gunthorpe 		if (bytes_mapped)
840806b101bSJason Gunthorpe 			*bytes_mapped += bcnt;
841806b101bSJason Gunthorpe 		/*
842806b101bSJason Gunthorpe 		 * The user could specify a SGL with multiple lkeys and only
843806b101bSJason Gunthorpe 		 * some of them are ODP. Treat the non-ODP ones as fully
844806b101bSJason Gunthorpe 		 * faulted.
845806b101bSJason Gunthorpe 		 */
846806b101bSJason Gunthorpe 		ret = 0;
847db72438cSYishai Hadas 		goto end;
848806b101bSJason Gunthorpe 	}
849db72438cSYishai Hadas 	refcount_inc(&mmkey->usecount);
850db72438cSYishai Hadas 	xa_unlock(&dev->odp_mkeys);
851db72438cSYishai Hadas 
852d623dfd2SArtemy Kovalyov 	if (!mkey_is_eq(mmkey, key)) {
853db570d7dSArtemy Kovalyov 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
854db570d7dSArtemy Kovalyov 		ret = -EFAULT;
855db72438cSYishai Hadas 		goto end;
856db570d7dSArtemy Kovalyov 	}
857db570d7dSArtemy Kovalyov 
858db570d7dSArtemy Kovalyov 	switch (mmkey->type) {
859db570d7dSArtemy Kovalyov 	case MLX5_MKEY_MR:
860db570d7dSArtemy Kovalyov 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
861db570d7dSArtemy Kovalyov 
862fb985e27SJason Gunthorpe 		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
863db570d7dSArtemy Kovalyov 		if (ret < 0)
864db72438cSYishai Hadas 			goto end;
865db570d7dSArtemy Kovalyov 
866a3de94e3SErez Alfasi 		mlx5_update_odp_stats(mr, faults, ret);
867a3de94e3SErez Alfasi 
868db570d7dSArtemy Kovalyov 		npages += ret;
869db570d7dSArtemy Kovalyov 		ret = 0;
870db570d7dSArtemy Kovalyov 		break;
871db570d7dSArtemy Kovalyov 
872db570d7dSArtemy Kovalyov 	case MLX5_MKEY_MW:
873414556afSYishai Hadas 	case MLX5_MKEY_INDIRECT_DEVX:
874db570d7dSArtemy Kovalyov 		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
875db570d7dSArtemy Kovalyov 			mlx5_ib_dbg(dev, "indirection level exceeded\n");
876db570d7dSArtemy Kovalyov 			ret = -EFAULT;
877db72438cSYishai Hadas 			goto end;
878db570d7dSArtemy Kovalyov 		}
879db570d7dSArtemy Kovalyov 
880db570d7dSArtemy Kovalyov 		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
881ae0579acSAharon Landau 			sizeof(*pklm) * (mmkey->ndescs - 2);
882db570d7dSArtemy Kovalyov 
883db570d7dSArtemy Kovalyov 		if (outlen > cur_outlen) {
884db570d7dSArtemy Kovalyov 			kfree(out);
885db570d7dSArtemy Kovalyov 			out = kzalloc(outlen, GFP_KERNEL);
886db570d7dSArtemy Kovalyov 			if (!out) {
887db570d7dSArtemy Kovalyov 				ret = -ENOMEM;
888db72438cSYishai Hadas 				goto end;
889db570d7dSArtemy Kovalyov 			}
890db570d7dSArtemy Kovalyov 			cur_outlen = outlen;
891db570d7dSArtemy Kovalyov 		}
892db570d7dSArtemy Kovalyov 
893db570d7dSArtemy Kovalyov 		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
894db570d7dSArtemy Kovalyov 						       bsf0_klm0_pas_mtt0_1);
895db570d7dSArtemy Kovalyov 
89683fec3f1SAharon Landau 		ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen);
897db570d7dSArtemy Kovalyov 		if (ret)
898db72438cSYishai Hadas 			goto end;
899db570d7dSArtemy Kovalyov 
900db570d7dSArtemy Kovalyov 		offset = io_virt - MLX5_GET64(query_mkey_out, out,
901db570d7dSArtemy Kovalyov 					      memory_key_mkey_entry.start_addr);
902db570d7dSArtemy Kovalyov 
903ae0579acSAharon Landau 		for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) {
904db570d7dSArtemy Kovalyov 			if (offset >= be32_to_cpu(pklm->bcount)) {
905db570d7dSArtemy Kovalyov 				offset -= be32_to_cpu(pklm->bcount);
906db570d7dSArtemy Kovalyov 				continue;
907db570d7dSArtemy Kovalyov 			}
908db570d7dSArtemy Kovalyov 
909db570d7dSArtemy Kovalyov 			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
910db570d7dSArtemy Kovalyov 			if (!frame) {
911db570d7dSArtemy Kovalyov 				ret = -ENOMEM;
912db72438cSYishai Hadas 				goto end;
913db570d7dSArtemy Kovalyov 			}
914db570d7dSArtemy Kovalyov 
915db570d7dSArtemy Kovalyov 			frame->key = be32_to_cpu(pklm->key);
916db570d7dSArtemy Kovalyov 			frame->io_virt = be64_to_cpu(pklm->va) + offset;
917db570d7dSArtemy Kovalyov 			frame->bcnt = min_t(size_t, bcnt,
918db570d7dSArtemy Kovalyov 					    be32_to_cpu(pklm->bcount) - offset);
919db570d7dSArtemy Kovalyov 			frame->depth = depth + 1;
920db570d7dSArtemy Kovalyov 			frame->next = head;
921db570d7dSArtemy Kovalyov 			head = frame;
922db570d7dSArtemy Kovalyov 
923db570d7dSArtemy Kovalyov 			bcnt -= frame->bcnt;
92475b7b86bSArtemy Kovalyov 			offset = 0;
925db570d7dSArtemy Kovalyov 		}
926db570d7dSArtemy Kovalyov 		break;
927db570d7dSArtemy Kovalyov 
928db570d7dSArtemy Kovalyov 	default:
929db570d7dSArtemy Kovalyov 		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
930db570d7dSArtemy Kovalyov 		ret = -EFAULT;
931db72438cSYishai Hadas 		goto end;
932db570d7dSArtemy Kovalyov 	}
933db570d7dSArtemy Kovalyov 
934db570d7dSArtemy Kovalyov 	if (head) {
935db570d7dSArtemy Kovalyov 		frame = head;
936db570d7dSArtemy Kovalyov 		head = frame->next;
937db570d7dSArtemy Kovalyov 
938db570d7dSArtemy Kovalyov 		key = frame->key;
939db570d7dSArtemy Kovalyov 		io_virt = frame->io_virt;
940db570d7dSArtemy Kovalyov 		bcnt = frame->bcnt;
941db570d7dSArtemy Kovalyov 		depth = frame->depth;
942db570d7dSArtemy Kovalyov 		kfree(frame);
943db570d7dSArtemy Kovalyov 
944db72438cSYishai Hadas 		mlx5r_deref_odp_mkey(mmkey);
945db570d7dSArtemy Kovalyov 		goto next_mr;
946db570d7dSArtemy Kovalyov 	}
9477bdf65d4SHaggai Eran 
948db72438cSYishai Hadas end:
949db72438cSYishai Hadas 	if (mmkey)
950db72438cSYishai Hadas 		mlx5r_deref_odp_mkey(mmkey);
951db570d7dSArtemy Kovalyov 	while (head) {
952db570d7dSArtemy Kovalyov 		frame = head;
953db570d7dSArtemy Kovalyov 		head = frame->next;
954db570d7dSArtemy Kovalyov 		kfree(frame);
955db570d7dSArtemy Kovalyov 	}
956db570d7dSArtemy Kovalyov 	kfree(out);
957db570d7dSArtemy Kovalyov 
958d9aaed83SArtemy Kovalyov 	*bytes_committed = 0;
9597bdf65d4SHaggai Eran 	return ret ? ret : npages;
9607bdf65d4SHaggai Eran }
9617bdf65d4SHaggai Eran 
962f9180399SLeon Romanovsky /*
9637bdf65d4SHaggai Eran  * Parse a series of data segments for page fault handling.
9647bdf65d4SHaggai Eran  *
9655e769e44SLee Jones  * @dev:  Pointer to mlx5 IB device
9665e769e44SLee Jones  * @pfault: contains page fault information.
9675e769e44SLee Jones  * @wqe: points at the first data segment in the WQE.
9685e769e44SLee Jones  * @wqe_end: points after the end of the WQE.
9695e769e44SLee Jones  * @bytes_mapped: receives the number of bytes that the function was able to
9707bdf65d4SHaggai Eran  *                map. This allows the caller to decide intelligently whether
9717bdf65d4SHaggai Eran  *                enough memory was mapped to resolve the page fault
9727bdf65d4SHaggai Eran  *                successfully (e.g. enough for the next MTU, or the entire
9737bdf65d4SHaggai Eran  *                WQE).
9745e769e44SLee Jones  * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
9757bdf65d4SHaggai Eran  *                   the committed bytes).
9765e769e44SLee Jones  * @receive_queue: true if this WQE comes from a receive queue
9777bdf65d4SHaggai Eran  *
9787bdf65d4SHaggai Eran  * Returns the number of pages loaded if positive, zero for an empty WQE, or a
9797bdf65d4SHaggai Eran  * negative error code.
9807bdf65d4SHaggai Eran  */
981d9aaed83SArtemy Kovalyov static int pagefault_data_segments(struct mlx5_ib_dev *dev,
982d9aaed83SArtemy Kovalyov 				   struct mlx5_pagefault *pfault,
983586f4e95SMoni Shoua 				   void *wqe,
9847bdf65d4SHaggai Eran 				   void *wqe_end, u32 *bytes_mapped,
9850f51427bSLeon Romanovsky 				   u32 *total_wqe_bytes, bool receive_queue)
9867bdf65d4SHaggai Eran {
9877bdf65d4SHaggai Eran 	int ret = 0, npages = 0;
9887bdf65d4SHaggai Eran 	u64 io_virt;
989a419bfb7SOr Har-Toov 	__be32 key;
9907bdf65d4SHaggai Eran 	u32 byte_count;
9917bdf65d4SHaggai Eran 	size_t bcnt;
9927bdf65d4SHaggai Eran 	int inline_segment;
9937bdf65d4SHaggai Eran 
9947bdf65d4SHaggai Eran 	if (bytes_mapped)
9957bdf65d4SHaggai Eran 		*bytes_mapped = 0;
9967bdf65d4SHaggai Eran 	if (total_wqe_bytes)
9977bdf65d4SHaggai Eran 		*total_wqe_bytes = 0;
9987bdf65d4SHaggai Eran 
9997bdf65d4SHaggai Eran 	while (wqe < wqe_end) {
10007bdf65d4SHaggai Eran 		struct mlx5_wqe_data_seg *dseg = wqe;
10017bdf65d4SHaggai Eran 
10027bdf65d4SHaggai Eran 		io_virt = be64_to_cpu(dseg->addr);
1003a419bfb7SOr Har-Toov 		key = dseg->lkey;
10047bdf65d4SHaggai Eran 		byte_count = be32_to_cpu(dseg->byte_count);
10057bdf65d4SHaggai Eran 		inline_segment = !!(byte_count &  MLX5_INLINE_SEG);
10067bdf65d4SHaggai Eran 		bcnt	       = byte_count & ~MLX5_INLINE_SEG;
10077bdf65d4SHaggai Eran 
10087bdf65d4SHaggai Eran 		if (inline_segment) {
10097bdf65d4SHaggai Eran 			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
10107bdf65d4SHaggai Eran 			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
10117bdf65d4SHaggai Eran 				     16);
10127bdf65d4SHaggai Eran 		} else {
10137bdf65d4SHaggai Eran 			wqe += sizeof(*dseg);
10147bdf65d4SHaggai Eran 		}
10157bdf65d4SHaggai Eran 
10167bdf65d4SHaggai Eran 		/* receive WQE end of sg list. */
1017a419bfb7SOr Har-Toov 		if (receive_queue && bcnt == 0 &&
1018*594cac11SOr Har-Toov 		    key == dev->mkeys.terminate_scatter_list_mkey &&
1019*594cac11SOr Har-Toov 		    io_virt == 0)
10207bdf65d4SHaggai Eran 			break;
10217bdf65d4SHaggai Eran 
10227bdf65d4SHaggai Eran 		if (!inline_segment && total_wqe_bytes) {
10237bdf65d4SHaggai Eran 			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
1024d9aaed83SArtemy Kovalyov 					pfault->bytes_committed);
10257bdf65d4SHaggai Eran 		}
10267bdf65d4SHaggai Eran 
10277bdf65d4SHaggai Eran 		/* A zero length data segment designates a length of 2GB. */
10287bdf65d4SHaggai Eran 		if (bcnt == 0)
10297bdf65d4SHaggai Eran 			bcnt = 1U << 31;
10307bdf65d4SHaggai Eran 
1031d9aaed83SArtemy Kovalyov 		if (inline_segment || bcnt <= pfault->bytes_committed) {
1032d9aaed83SArtemy Kovalyov 			pfault->bytes_committed -=
10337bdf65d4SHaggai Eran 				min_t(size_t, bcnt,
1034d9aaed83SArtemy Kovalyov 				      pfault->bytes_committed);
10357bdf65d4SHaggai Eran 			continue;
10367bdf65d4SHaggai Eran 		}
10377bdf65d4SHaggai Eran 
1038a419bfb7SOr Har-Toov 		ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key),
103981dd4c4bSMoni Shoua 						    io_virt, bcnt,
1040d9aaed83SArtemy Kovalyov 						    &pfault->bytes_committed,
1041fb985e27SJason Gunthorpe 						    bytes_mapped);
10427bdf65d4SHaggai Eran 		if (ret < 0)
10437bdf65d4SHaggai Eran 			break;
10447bdf65d4SHaggai Eran 		npages += ret;
10457bdf65d4SHaggai Eran 	}
10467bdf65d4SHaggai Eran 
10477bdf65d4SHaggai Eran 	return ret < 0 ? ret : npages;
10487bdf65d4SHaggai Eran }
10497bdf65d4SHaggai Eran 
10507bdf65d4SHaggai Eran /*
10517bdf65d4SHaggai Eran  * Parse an initiator WQE. Advances the wqe pointer to point at the
10527bdf65d4SHaggai Eran  * scatter-gather list, and sets wqe_end to the end of the WQE.
10537bdf65d4SHaggai Eran  */
10547bdf65d4SHaggai Eran static int mlx5_ib_mr_initiator_pfault_handler(
1055d9aaed83SArtemy Kovalyov 	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
1056d9aaed83SArtemy Kovalyov 	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
10577bdf65d4SHaggai Eran {
10587bdf65d4SHaggai Eran 	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
1059d9aaed83SArtemy Kovalyov 	u16 wqe_index = pfault->wqe.wqe_index;
106017d2f88fSArtemy Kovalyov 	struct mlx5_base_av *av;
10617bdf65d4SHaggai Eran 	unsigned ds, opcode;
106219098df2Smajd@mellanox.com 	u32 qpn = qp->trans_qp.base.mqp.qpn;
10637bdf65d4SHaggai Eran 
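	/*
	 * ctrl->qpn_ds encodes the WQE size in units of MLX5_WQE_DS_UNITS
	 * (16-byte data segments); sanity check it against the number of
	 * bytes actually copied from the user work queue (wqe_length).
	 */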
10647bdf65d4SHaggai Eran 	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
10657bdf65d4SHaggai Eran 	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
10667bdf65d4SHaggai Eran 		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, wqe_length = 0x%x\n",
10677bdf65d4SHaggai Eran 			    ds, wqe_length);
10687bdf65d4SHaggai Eran 		return -EFAULT;
10697bdf65d4SHaggai Eran 	}
10707bdf65d4SHaggai Eran 
10717bdf65d4SHaggai Eran 	if (ds == 0) {
10727bdf65d4SHaggai Eran 		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
107319098df2Smajd@mellanox.com 			    wqe_index, qpn);
10747bdf65d4SHaggai Eran 		return -EFAULT;
10757bdf65d4SHaggai Eran 	}
10767bdf65d4SHaggai Eran 
10777bdf65d4SHaggai Eran 	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
10787bdf65d4SHaggai Eran 	*wqe += sizeof(*ctrl);
10797bdf65d4SHaggai Eran 
10807bdf65d4SHaggai Eran 	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
10817bdf65d4SHaggai Eran 		 MLX5_WQE_CTRL_OPCODE_MASK;
108217d2f88fSArtemy Kovalyov 
10839ecf6ac1SMaor Gottlieb 	if (qp->type == IB_QPT_XRC_INI)
108429917f47SMoni Shoua 		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
10857bdf65d4SHaggai Eran 
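	/*
	 * UD and DCI WQEs carry an address vector next; skip either the
	 * extended or the base AV format depending on MLX5_EXTENDED_UD_AV.
	 */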
10867aede1a2SLeon Romanovsky 	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
108717d2f88fSArtemy Kovalyov 		av = *wqe;
1088931b3c1aSLeon Romanovsky 		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
108917d2f88fSArtemy Kovalyov 			*wqe += sizeof(struct mlx5_av);
109017d2f88fSArtemy Kovalyov 		else
109117d2f88fSArtemy Kovalyov 			*wqe += sizeof(struct mlx5_base_av);
109217d2f88fSArtemy Kovalyov 	}
109317d2f88fSArtemy Kovalyov 
109417d2f88fSArtemy Kovalyov 	switch (opcode) {
109517d2f88fSArtemy Kovalyov 	case MLX5_OPCODE_RDMA_WRITE:
109617d2f88fSArtemy Kovalyov 	case MLX5_OPCODE_RDMA_WRITE_IMM:
109717d2f88fSArtemy Kovalyov 	case MLX5_OPCODE_RDMA_READ:
109817d2f88fSArtemy Kovalyov 		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
109917d2f88fSArtemy Kovalyov 		break;
110017d2f88fSArtemy Kovalyov 	case MLX5_OPCODE_ATOMIC_CS:
110117d2f88fSArtemy Kovalyov 	case MLX5_OPCODE_ATOMIC_FA:
110217d2f88fSArtemy Kovalyov 		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
110317d2f88fSArtemy Kovalyov 		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
110417d2f88fSArtemy Kovalyov 		break;
110517d2f88fSArtemy Kovalyov 	}
110617d2f88fSArtemy Kovalyov 
11077bdf65d4SHaggai Eran 	return 0;
11087bdf65d4SHaggai Eran }
11097bdf65d4SHaggai Eran 
11107bdf65d4SHaggai Eran /*
11116ff7414aSMoni Shoua  * Parse responder WQE and set wqe_end to the end of the WQE.
11127bdf65d4SHaggai Eran  */
111308100fadSMoni Shoua static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
111408100fadSMoni Shoua 						   struct mlx5_ib_srq *srq,
111508100fadSMoni Shoua 						   void **wqe, void **wqe_end,
111608100fadSMoni Shoua 						   int wqe_length)
111708100fadSMoni Shoua {
111808100fadSMoni Shoua 	int wqe_size = 1 << srq->msrq.wqe_shift;
111908100fadSMoni Shoua 
112008100fadSMoni Shoua 	if (wqe_size > wqe_length) {
112108100fadSMoni Shoua 		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
112208100fadSMoni Shoua 		return -EFAULT;
112308100fadSMoni Shoua 	}
112408100fadSMoni Shoua 
112508100fadSMoni Shoua 	*wqe_end = *wqe + wqe_size;
112608100fadSMoni Shoua 	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);
112708100fadSMoni Shoua 
112808100fadSMoni Shoua 	return 0;
112908100fadSMoni Shoua }
113008100fadSMoni Shoua 
113108100fadSMoni Shoua static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
113208100fadSMoni Shoua 						  struct mlx5_ib_qp *qp,
113308100fadSMoni Shoua 						  void *wqe, void **wqe_end,
11346ff7414aSMoni Shoua 						  int wqe_length)
11357bdf65d4SHaggai Eran {
11367bdf65d4SHaggai Eran 	struct mlx5_ib_wq *wq = &qp->rq;
11377bdf65d4SHaggai Eran 	int wqe_size = 1 << wq->wqe_shift;
11387bdf65d4SHaggai Eran 
1139c95e6d53SLeon Romanovsky 	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
11407bdf65d4SHaggai Eran 		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
11417bdf65d4SHaggai Eran 		return -EFAULT;
11427bdf65d4SHaggai Eran 	}
11437bdf65d4SHaggai Eran 
11447bdf65d4SHaggai Eran 	if (wqe_size > wqe_length) {
11457bdf65d4SHaggai Eran 		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
11467bdf65d4SHaggai Eran 		return -EFAULT;
11477bdf65d4SHaggai Eran 	}
11487bdf65d4SHaggai Eran 
11496ff7414aSMoni Shoua 	*wqe_end = wqe + wqe_size;
11507bdf65d4SHaggai Eran 
11517bdf65d4SHaggai Eran 	return 0;
11527bdf65d4SHaggai Eran }
11537bdf65d4SHaggai Eran 
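/*
 * Map the WQ number reported in a page-fault event to the underlying
 * SRQ (for RMP faults) or QP resource, taking a reference that the
 * caller releases with mlx5_core_res_put().
 */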
1154032080abSMoni Shoua static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
1155032080abSMoni Shoua 						       u32 wq_num, int pf_type)
11567bdf65d4SHaggai Eran {
115710f56242SMoni Shoua 	struct mlx5_core_rsc_common *common = NULL;
115810f56242SMoni Shoua 	struct mlx5_core_srq *srq;
1159d9aaed83SArtemy Kovalyov 
1160032080abSMoni Shoua 	switch (pf_type) {
1161032080abSMoni Shoua 	case MLX5_WQE_PF_TYPE_RMP:
116210f56242SMoni Shoua 		srq = mlx5_cmd_get_srq(dev, wq_num);
116310f56242SMoni Shoua 		if (srq)
116410f56242SMoni Shoua 			common = &srq->common;
1165032080abSMoni Shoua 		break;
1166032080abSMoni Shoua 	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
1167032080abSMoni Shoua 	case MLX5_WQE_PF_TYPE_RESP:
1168032080abSMoni Shoua 	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
1169333fbaa0SLeon Romanovsky 		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
1170032080abSMoni Shoua 		break;
1171032080abSMoni Shoua 	default:
117210f56242SMoni Shoua 		break;
1173d9aaed83SArtemy Kovalyov 	}
1174d9aaed83SArtemy Kovalyov 
117510f56242SMoni Shoua 	return common;
1176032080abSMoni Shoua }
1177032080abSMoni Shoua 
1178032080abSMoni Shoua static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
1179032080abSMoni Shoua {
1180032080abSMoni Shoua 	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;
1181032080abSMoni Shoua 
1182d9aaed83SArtemy Kovalyov 	return to_mibqp(mqp);
1183d9aaed83SArtemy Kovalyov }
1184d9aaed83SArtemy Kovalyov 
118508100fadSMoni Shoua static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
118608100fadSMoni Shoua {
118708100fadSMoni Shoua 	struct mlx5_core_srq *msrq =
118808100fadSMoni Shoua 		container_of(res, struct mlx5_core_srq, common);
118908100fadSMoni Shoua 
119008100fadSMoni Shoua 	return to_mibsrq(msrq);
119108100fadSMoni Shoua }
119208100fadSMoni Shoua 
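/*
 * Handle a WQE page fault: look up the faulting QP or SRQ, copy the
 * offending WQE from the user work queue into a scratch page, parse its
 * data segments and fault in the memory they reference, then resume the
 * queue (flagging an error if resolution failed).
 */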
1193d9aaed83SArtemy Kovalyov static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
1194d9aaed83SArtemy Kovalyov 					  struct mlx5_pagefault *pfault)
1195d9aaed83SArtemy Kovalyov {
11960f51427bSLeon Romanovsky 	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
1197d9aaed83SArtemy Kovalyov 	u16 wqe_index = pfault->wqe.wqe_index;
1198130c2c57SDanit Goldberg 	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
11990f51427bSLeon Romanovsky 	u32 bytes_mapped, total_wqe_bytes;
12000f51427bSLeon Romanovsky 	struct mlx5_core_rsc_common *res;
12010f51427bSLeon Romanovsky 	int resume_with_error = 1;
12020f51427bSLeon Romanovsky 	struct mlx5_ib_qp *qp;
1203fbeb4075SMoni Shoua 	size_t bytes_copied;
12040f51427bSLeon Romanovsky 	int ret = 0;
12057bdf65d4SHaggai Eran 
1206032080abSMoni Shoua 	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
1207032080abSMoni Shoua 	if (!res) {
1208032080abSMoni Shoua 		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
1209032080abSMoni Shoua 		return;
1210032080abSMoni Shoua 	}
1211032080abSMoni Shoua 
12120f51427bSLeon Romanovsky 	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
12130f51427bSLeon Romanovsky 	    res->res != MLX5_RES_XSRQ) {
12140f51427bSLeon Romanovsky 		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
12150f51427bSLeon Romanovsky 			    pfault->type);
1216032080abSMoni Shoua 		goto resolve_page_fault;
1217032080abSMoni Shoua 	}
1218032080abSMoni Shoua 
1219130c2c57SDanit Goldberg 	wqe_start = (void *)__get_free_page(GFP_KERNEL);
1220130c2c57SDanit Goldberg 	if (!wqe_start) {
12217bdf65d4SHaggai Eran 		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
12227bdf65d4SHaggai Eran 		goto resolve_page_fault;
12237bdf65d4SHaggai Eran 	}
12247bdf65d4SHaggai Eran 
1225130c2c57SDanit Goldberg 	wqe = wqe_start;
12260f51427bSLeon Romanovsky 	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
12270f51427bSLeon Romanovsky 	if (qp && sq) {
1228da9ee9d8SMoni Shoua 		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
1229fbeb4075SMoni Shoua 					  &bytes_copied);
12300f51427bSLeon Romanovsky 		if (ret)
12310f51427bSLeon Romanovsky 			goto read_user;
12320f51427bSLeon Romanovsky 		ret = mlx5_ib_mr_initiator_pfault_handler(
12330f51427bSLeon Romanovsky 			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
12340f51427bSLeon Romanovsky 	} else if (qp && !sq) {
1235da9ee9d8SMoni Shoua 		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
1236fbeb4075SMoni Shoua 					  &bytes_copied);
12370f51427bSLeon Romanovsky 		if (ret)
12380f51427bSLeon Romanovsky 			goto read_user;
12390f51427bSLeon Romanovsky 		ret = mlx5_ib_mr_responder_pfault_handler_rq(
12400f51427bSLeon Romanovsky 			dev, qp, wqe, &wqe_end, bytes_copied);
12410f51427bSLeon Romanovsky 	} else if (!qp) {
12420f51427bSLeon Romanovsky 		struct mlx5_ib_srq *srq = res_to_srq(res);
12430f51427bSLeon Romanovsky 
1244da9ee9d8SMoni Shoua 		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
124508100fadSMoni Shoua 					   &bytes_copied);
12460f51427bSLeon Romanovsky 		if (ret)
12470f51427bSLeon Romanovsky 			goto read_user;
12480f51427bSLeon Romanovsky 		ret = mlx5_ib_mr_responder_pfault_handler_srq(
12490f51427bSLeon Romanovsky 			dev, srq, &wqe, &wqe_end, bytes_copied);
125008100fadSMoni Shoua 	}
1251fbeb4075SMoni Shoua 
12520f51427bSLeon Romanovsky 	if (ret < 0 || wqe >= wqe_end)
12530f51427bSLeon Romanovsky 		goto resolve_page_fault;
12540f51427bSLeon Romanovsky 
12550f51427bSLeon Romanovsky 	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
12560f51427bSLeon Romanovsky 				      &total_wqe_bytes, !sq);
12570f51427bSLeon Romanovsky 	if (ret == -EAGAIN)
12580f51427bSLeon Romanovsky 		goto out;
12590f51427bSLeon Romanovsky 
12600f51427bSLeon Romanovsky 	if (ret < 0 || total_wqe_bytes > bytes_mapped)
12610f51427bSLeon Romanovsky 		goto resolve_page_fault;
12620f51427bSLeon Romanovsky 
12630f51427bSLeon Romanovsky out:
12640f51427bSLeon Romanovsky 	ret = 0;
12650f51427bSLeon Romanovsky 	resume_with_error = 0;
12660f51427bSLeon Romanovsky 
12670f51427bSLeon Romanovsky read_user:
12680f51427bSLeon Romanovsky 	if (ret)
12690f51427bSLeon Romanovsky 		mlx5_ib_err(
12700f51427bSLeon Romanovsky 			dev,
12710f51427bSLeon Romanovsky 			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
1272d9aaed83SArtemy Kovalyov 			ret, wqe_index, pfault->token);
12737bdf65d4SHaggai Eran 
12747bdf65d4SHaggai Eran resolve_page_fault:
1275d9aaed83SArtemy Kovalyov 	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
1276d9aaed83SArtemy Kovalyov 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
127781713d37SArtemy Kovalyov 		    pfault->wqe.wq_num, resume_with_error,
1278d9aaed83SArtemy Kovalyov 		    pfault->type);
1279032080abSMoni Shoua 	mlx5_core_res_put(res);
1280130c2c57SDanit Goldberg 	free_page((unsigned long)wqe_start);
12817bdf65d4SHaggai Eran }
12827bdf65d4SHaggai Eran 
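/*
 * Number of pages spanned by [address, address + length); e.g. with 4K
 * pages, an 8-byte range starting at 0x1ffc covers two pages.
 */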
1283eab668a6SHaggai Eran static int pages_in_range(u64 address, u32 length)
1284eab668a6SHaggai Eran {
1285eab668a6SHaggai Eran 	return (ALIGN(address + length, PAGE_SIZE) -
1286eab668a6SHaggai Eran 		(address & PAGE_MASK)) >> PAGE_SHIFT;
1287eab668a6SHaggai Eran }
1288eab668a6SHaggai Eran 
1289d9aaed83SArtemy Kovalyov static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
1290d9aaed83SArtemy Kovalyov 					   struct mlx5_pagefault *pfault)
1291eab668a6SHaggai Eran {
1292eab668a6SHaggai Eran 	u64 address;
1293eab668a6SHaggai Eran 	u32 length;
1294d9aaed83SArtemy Kovalyov 	u32 prefetch_len = pfault->bytes_committed;
1295eab668a6SHaggai Eran 	int prefetch_activated = 0;
1296d9aaed83SArtemy Kovalyov 	u32 rkey = pfault->rdma.r_key;
1297eab668a6SHaggai Eran 	int ret;
1298eab668a6SHaggai Eran 
1299eab668a6SHaggai Eran 	/* The RDMA responder handler handles the page fault in two parts.
1300eab668a6SHaggai Eran 	 * First it brings the necessary pages for the current packet
1301eab668a6SHaggai Eran 	 * (and uses the pfault context), and then (after resuming the QP)
1302eab668a6SHaggai Eran 	 * prefetches more pages. The second operation cannot use the pfault
1303eab668a6SHaggai Eran 	 * context and therefore uses a separate bytes_committed counter
1304eab668a6SHaggai Eran 	 * allocated on the stack. */
1305d9aaed83SArtemy Kovalyov 	pfault->rdma.rdma_va += pfault->bytes_committed;
1306d9aaed83SArtemy Kovalyov 	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
1307d9aaed83SArtemy Kovalyov 					 pfault->rdma.rdma_op_len);
1308d9aaed83SArtemy Kovalyov 	pfault->bytes_committed = 0;
1309eab668a6SHaggai Eran 
1310d9aaed83SArtemy Kovalyov 	address = pfault->rdma.rdma_va;
1311d9aaed83SArtemy Kovalyov 	length  = pfault->rdma.rdma_op_len;
1312eab668a6SHaggai Eran 
1313eab668a6SHaggai Eran 	/* For some operations, the hardware cannot tell the exact message
1314eab668a6SHaggai Eran 	 * length, and in those cases it reports zero. Use prefetch
1315eab668a6SHaggai Eran 	 * logic. */
1316eab668a6SHaggai Eran 	if (length == 0) {
1317eab668a6SHaggai Eran 		prefetch_activated = 1;
1318d9aaed83SArtemy Kovalyov 		length = pfault->rdma.packet_size;
1319eab668a6SHaggai Eran 		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
1320eab668a6SHaggai Eran 	}
1321eab668a6SHaggai Eran 
132281dd4c4bSMoni Shoua 	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
1323fb985e27SJason Gunthorpe 					    &pfault->bytes_committed, NULL);
1324eab668a6SHaggai Eran 	if (ret == -EAGAIN) {
1325eab668a6SHaggai Eran 		/* We're racing with an invalidation, don't prefetch */
1326eab668a6SHaggai Eran 		prefetch_activated = 0;
1327eab668a6SHaggai Eran 	} else if (ret < 0 || pages_in_range(address, length) > ret) {
1328d9aaed83SArtemy Kovalyov 		mlx5_ib_page_fault_resume(dev, pfault, 1);
1329d9aaed83SArtemy Kovalyov 		if (ret != -ENOENT)
13304df4a5baSArtemy Kovalyov 			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
1331d9aaed83SArtemy Kovalyov 				    ret, pfault->token, pfault->type);
1332eab668a6SHaggai Eran 		return;
1333eab668a6SHaggai Eran 	}
1334eab668a6SHaggai Eran 
1335d9aaed83SArtemy Kovalyov 	mlx5_ib_page_fault_resume(dev, pfault, 0);
1336d9aaed83SArtemy Kovalyov 	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
1337d9aaed83SArtemy Kovalyov 		    pfault->token, pfault->type,
1338d9aaed83SArtemy Kovalyov 		    prefetch_activated);
1339eab668a6SHaggai Eran 
1340eab668a6SHaggai Eran 	/* At this point, there might be a new pagefault already arriving in
1341eab668a6SHaggai Eran 	 * the eq; switch to a local bytes_committed counter for the rest of
1342eab668a6SHaggai Eran 	 * the processing. We're still OK with the objects being alive as the
1343eab668a6SHaggai Eran 	 * work-queue is being fenced. */
1344eab668a6SHaggai Eran 
1345eab668a6SHaggai Eran 	if (prefetch_activated) {
1346d9aaed83SArtemy Kovalyov 		u32 bytes_committed = 0;
1347d9aaed83SArtemy Kovalyov 
134881dd4c4bSMoni Shoua 		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
1349eab668a6SHaggai Eran 						    prefetch_len,
1350fb985e27SJason Gunthorpe 						    &bytes_committed, NULL);
135181713d37SArtemy Kovalyov 		if (ret < 0 && ret != -EAGAIN) {
13524df4a5baSArtemy Kovalyov 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
135381713d37SArtemy Kovalyov 				    ret, pfault->token, address, prefetch_len);
1354eab668a6SHaggai Eran 		}
1355eab668a6SHaggai Eran 	}
1356eab668a6SHaggai Eran }
1357eab668a6SHaggai Eran 
1358d5d284b8SSaeed Mahameed static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
13596aec21f6SHaggai Eran {
1360d9aaed83SArtemy Kovalyov 	u8 event_subtype = pfault->event_subtype;
13616aec21f6SHaggai Eran 
13626aec21f6SHaggai Eran 	switch (event_subtype) {
13637bdf65d4SHaggai Eran 	case MLX5_PFAULT_SUBTYPE_WQE:
1364d9aaed83SArtemy Kovalyov 		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
13657bdf65d4SHaggai Eran 		break;
1366eab668a6SHaggai Eran 	case MLX5_PFAULT_SUBTYPE_RDMA:
1367d9aaed83SArtemy Kovalyov 		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
1368eab668a6SHaggai Eran 		break;
13696aec21f6SHaggai Eran 	default:
1370d9aaed83SArtemy Kovalyov 		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
13716aec21f6SHaggai Eran 			    event_subtype);
1372d9aaed83SArtemy Kovalyov 		mlx5_ib_page_fault_resume(dev, pfault, 1);
13736aec21f6SHaggai Eran 	}
13746aec21f6SHaggai Eran }
13756aec21f6SHaggai Eran 
1376d5d284b8SSaeed Mahameed static void mlx5_ib_eqe_pf_action(struct work_struct *work)
1377d5d284b8SSaeed Mahameed {
1378d5d284b8SSaeed Mahameed 	struct mlx5_pagefault *pfault = container_of(work,
1379d5d284b8SSaeed Mahameed 						     struct mlx5_pagefault,
1380d5d284b8SSaeed Mahameed 						     work);
1381d5d284b8SSaeed Mahameed 	struct mlx5_ib_pf_eq *eq = pfault->eq;
1382d5d284b8SSaeed Mahameed 
1383d5d284b8SSaeed Mahameed 	mlx5_ib_pfault(eq->dev, pfault);
1384d5d284b8SSaeed Mahameed 	mempool_free(pfault, eq->pool);
1385d5d284b8SSaeed Mahameed }
1386d5d284b8SSaeed Mahameed 
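/*
 * Drain the page-fault EQ: decode each EQE into a struct mlx5_pagefault
 * taken from the mempool and queue it to the handler workqueue.  If the
 * atomic allocation fails, stop early and let the work item refill the
 * pool and resume; the consumer index is updated and the EQ re-armed at
 * the end.
 */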
1387d5d284b8SSaeed Mahameed static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
1388d5d284b8SSaeed Mahameed {
1389d5d284b8SSaeed Mahameed 	struct mlx5_eqe_page_fault *pf_eqe;
1390d5d284b8SSaeed Mahameed 	struct mlx5_pagefault *pfault;
1391d5d284b8SSaeed Mahameed 	struct mlx5_eqe *eqe;
1392d5d284b8SSaeed Mahameed 	int cc = 0;
1393d5d284b8SSaeed Mahameed 
1394d5d284b8SSaeed Mahameed 	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
1395d5d284b8SSaeed Mahameed 		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
1396d5d284b8SSaeed Mahameed 		if (!pfault) {
1397d5d284b8SSaeed Mahameed 			schedule_work(&eq->work);
1398d5d284b8SSaeed Mahameed 			break;
1399d5d284b8SSaeed Mahameed 		}
1400d5d284b8SSaeed Mahameed 
1401d5d284b8SSaeed Mahameed 		pf_eqe = &eqe->data.page_fault;
1402d5d284b8SSaeed Mahameed 		pfault->event_subtype = eqe->sub_type;
1403d5d284b8SSaeed Mahameed 		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);
1404d5d284b8SSaeed Mahameed 
1405d5d284b8SSaeed Mahameed 		mlx5_ib_dbg(eq->dev,
1406d5d284b8SSaeed Mahameed 			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
1407d5d284b8SSaeed Mahameed 			    eqe->sub_type, pfault->bytes_committed);
1408d5d284b8SSaeed Mahameed 
1409d5d284b8SSaeed Mahameed 		switch (eqe->sub_type) {
1410d5d284b8SSaeed Mahameed 		case MLX5_PFAULT_SUBTYPE_RDMA:
1411d5d284b8SSaeed Mahameed 			/* RDMA based event */
1412d5d284b8SSaeed Mahameed 			pfault->type =
1413d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
1414d5d284b8SSaeed Mahameed 			pfault->token =
1415d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->rdma.pftype_token) &
1416d5d284b8SSaeed Mahameed 				MLX5_24BIT_MASK;
1417d5d284b8SSaeed Mahameed 			pfault->rdma.r_key =
1418d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->rdma.r_key);
1419d5d284b8SSaeed Mahameed 			pfault->rdma.packet_size =
1420d5d284b8SSaeed Mahameed 				be16_to_cpu(pf_eqe->rdma.packet_length);
1421d5d284b8SSaeed Mahameed 			pfault->rdma.rdma_op_len =
1422d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
1423d5d284b8SSaeed Mahameed 			pfault->rdma.rdma_va =
1424d5d284b8SSaeed Mahameed 				be64_to_cpu(pf_eqe->rdma.rdma_va);
1425d5d284b8SSaeed Mahameed 			mlx5_ib_dbg(eq->dev,
1426d5d284b8SSaeed Mahameed 				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
1427d5d284b8SSaeed Mahameed 				    pfault->type, pfault->token,
1428d5d284b8SSaeed Mahameed 				    pfault->rdma.r_key);
1429d5d284b8SSaeed Mahameed 			mlx5_ib_dbg(eq->dev,
1430d5d284b8SSaeed Mahameed 				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
1431d5d284b8SSaeed Mahameed 				    pfault->rdma.rdma_op_len,
1432d5d284b8SSaeed Mahameed 				    pfault->rdma.rdma_va);
1433d5d284b8SSaeed Mahameed 			break;
1434d5d284b8SSaeed Mahameed 
1435d5d284b8SSaeed Mahameed 		case MLX5_PFAULT_SUBTYPE_WQE:
1436d5d284b8SSaeed Mahameed 			/* WQE based event */
1437d5d284b8SSaeed Mahameed 			pfault->type =
1438d5d284b8SSaeed Mahameed 				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
1439d5d284b8SSaeed Mahameed 			pfault->token =
1440d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->wqe.token);
1441d5d284b8SSaeed Mahameed 			pfault->wqe.wq_num =
1442d5d284b8SSaeed Mahameed 				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
1443d5d284b8SSaeed Mahameed 				MLX5_24BIT_MASK;
1444d5d284b8SSaeed Mahameed 			pfault->wqe.wqe_index =
1445d5d284b8SSaeed Mahameed 				be16_to_cpu(pf_eqe->wqe.wqe_index);
1446d5d284b8SSaeed Mahameed 			pfault->wqe.packet_size =
1447d5d284b8SSaeed Mahameed 				be16_to_cpu(pf_eqe->wqe.packet_length);
1448d5d284b8SSaeed Mahameed 			mlx5_ib_dbg(eq->dev,
1449d5d284b8SSaeed Mahameed 				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
1450d5d284b8SSaeed Mahameed 				    pfault->type, pfault->token,
1451d5d284b8SSaeed Mahameed 				    pfault->wqe.wq_num,
1452d5d284b8SSaeed Mahameed 				    pfault->wqe.wqe_index);
1453d5d284b8SSaeed Mahameed 			break;
1454d5d284b8SSaeed Mahameed 
1455d5d284b8SSaeed Mahameed 		default:
1456d5d284b8SSaeed Mahameed 			mlx5_ib_warn(eq->dev,
1457d5d284b8SSaeed Mahameed 				     "Unsupported page fault event sub-type: 0x%02hhx\n",
1458d5d284b8SSaeed Mahameed 				     eqe->sub_type);
1459d5d284b8SSaeed Mahameed 			/* Unsupported page faults should still be
1460d5d284b8SSaeed Mahameed 			 * resolved by the page fault handler
1461d5d284b8SSaeed Mahameed 			 */
1462d5d284b8SSaeed Mahameed 		}
1463d5d284b8SSaeed Mahameed 
1464d5d284b8SSaeed Mahameed 		pfault->eq = eq;
1465d5d284b8SSaeed Mahameed 		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
1466d5d284b8SSaeed Mahameed 		queue_work(eq->wq, &pfault->work);
1467d5d284b8SSaeed Mahameed 
1468d5d284b8SSaeed Mahameed 		cc = mlx5_eq_update_cc(eq->core, ++cc);
1469d5d284b8SSaeed Mahameed 	}
1470d5d284b8SSaeed Mahameed 
1471d5d284b8SSaeed Mahameed 	mlx5_eq_update_ci(eq->core, cc, 1);
1472d5d284b8SSaeed Mahameed }
1473d5d284b8SSaeed Mahameed 
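/*
 * Page-fault EQ interrupt notifier: process the EQ inline when the lock
 * is free, otherwise defer to the work item so only one context walks
 * the EQ at a time.
 */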
1474ca390799SYuval Avnery static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
1475ca390799SYuval Avnery 			     void *data)
1476d5d284b8SSaeed Mahameed {
1477ca390799SYuval Avnery 	struct mlx5_ib_pf_eq *eq =
1478ca390799SYuval Avnery 		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
1479d5d284b8SSaeed Mahameed 	unsigned long flags;
1480d5d284b8SSaeed Mahameed 
1481d5d284b8SSaeed Mahameed 	if (spin_trylock_irqsave(&eq->lock, flags)) {
1482d5d284b8SSaeed Mahameed 		mlx5_ib_eq_pf_process(eq);
1483d5d284b8SSaeed Mahameed 		spin_unlock_irqrestore(&eq->lock, flags);
1484d5d284b8SSaeed Mahameed 	} else {
1485d5d284b8SSaeed Mahameed 		schedule_work(&eq->work);
1486d5d284b8SSaeed Mahameed 	}
1487d5d284b8SSaeed Mahameed 
1488d5d284b8SSaeed Mahameed 	return IRQ_HANDLED;
1489d5d284b8SSaeed Mahameed }
1490d5d284b8SSaeed Mahameed 
1491d5d284b8SSaeed Mahameed /* mempool_refill() was proposed but unfortunately wasn't accepted
1492d5d284b8SSaeed Mahameed  * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
1493d5d284b8SSaeed Mahameed  * Cheap workaround.
1494d5d284b8SSaeed Mahameed  */
1495d5d284b8SSaeed Mahameed static void mempool_refill(mempool_t *pool)
1496d5d284b8SSaeed Mahameed {
1497d5d284b8SSaeed Mahameed 	while (pool->curr_nr < pool->min_nr)
1498d5d284b8SSaeed Mahameed 		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
1499d5d284b8SSaeed Mahameed }
1500d5d284b8SSaeed Mahameed 
1501d5d284b8SSaeed Mahameed static void mlx5_ib_eq_pf_action(struct work_struct *work)
1502d5d284b8SSaeed Mahameed {
1503d5d284b8SSaeed Mahameed 	struct mlx5_ib_pf_eq *eq =
1504d5d284b8SSaeed Mahameed 		container_of(work, struct mlx5_ib_pf_eq, work);
1505d5d284b8SSaeed Mahameed 
1506d5d284b8SSaeed Mahameed 	mempool_refill(eq->pool);
1507d5d284b8SSaeed Mahameed 
1508d5d284b8SSaeed Mahameed 	spin_lock_irq(&eq->lock);
1509d5d284b8SSaeed Mahameed 	mlx5_ib_eq_pf_process(eq);
1510d5d284b8SSaeed Mahameed 	spin_unlock_irq(&eq->lock);
1511d5d284b8SSaeed Mahameed }
1512d5d284b8SSaeed Mahameed 
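/*
 * Page-fault EQ sizing: the number of EQ entries and the number of
 * pre-allocated pagefault descriptors kept in the mempool so that EQE
 * processing can continue when atomic allocation fails.
 */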
1513d5d284b8SSaeed Mahameed enum {
1514d5d284b8SSaeed Mahameed 	MLX5_IB_NUM_PF_EQE	= 0x1000,
1515d5d284b8SSaeed Mahameed 	MLX5_IB_NUM_PF_DRAIN	= 64,
1516d5d284b8SSaeed Mahameed };
1517d5d284b8SSaeed Mahameed 
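/*
 * Lazily create the ODP page-fault EQ: a mempool of pagefault
 * descriptors, a high-priority workqueue, and a generic EQ subscribed to
 * MLX5_EVENT_TYPE_PAGE_FAULT events.  Serialized by odp_eq_mutex so
 * concurrent callers end up sharing a single EQ.
 */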
1518ad50294dSShay Drory int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1519d5d284b8SSaeed Mahameed {
1520d5d284b8SSaeed Mahameed 	struct mlx5_eq_param param = {};
1521ad50294dSShay Drory 	int err = 0;
1522d5d284b8SSaeed Mahameed 
1523ad50294dSShay Drory 	mutex_lock(&dev->odp_eq_mutex);
1524ad50294dSShay Drory 	if (eq->core)
1525ad50294dSShay Drory 		goto unlock;
1526d5d284b8SSaeed Mahameed 	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
1527d5d284b8SSaeed Mahameed 	spin_lock_init(&eq->lock);
1528d5d284b8SSaeed Mahameed 	eq->dev = dev;
1529d5d284b8SSaeed Mahameed 
1530d5d284b8SSaeed Mahameed 	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
1531d5d284b8SSaeed Mahameed 					       sizeof(struct mlx5_pagefault));
1532ad50294dSShay Drory 	if (!eq->pool) {
1533ad50294dSShay Drory 		err = -ENOMEM;
1534ad50294dSShay Drory 		goto unlock;
1535ad50294dSShay Drory 	}
1536d5d284b8SSaeed Mahameed 
1537d5d284b8SSaeed Mahameed 	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
1538d5d284b8SSaeed Mahameed 				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
1539d5d284b8SSaeed Mahameed 				 MLX5_NUM_CMD_EQE);
1540d5d284b8SSaeed Mahameed 	if (!eq->wq) {
1541d5d284b8SSaeed Mahameed 		err = -ENOMEM;
1542d5d284b8SSaeed Mahameed 		goto err_mempool;
1543d5d284b8SSaeed Mahameed 	}
1544d5d284b8SSaeed Mahameed 
1545ca390799SYuval Avnery 	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
1546d5d284b8SSaeed Mahameed 	param = (struct mlx5_eq_param) {
1547d5d284b8SSaeed Mahameed 		.nent = MLX5_IB_NUM_PF_EQE,
1548d5d284b8SSaeed Mahameed 	};
1549b9a7ba55SYishai Hadas 	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
155024163189SYuval Avnery 	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
1551d5d284b8SSaeed Mahameed 	if (IS_ERR(eq->core)) {
1552d5d284b8SSaeed Mahameed 		err = PTR_ERR(eq->core);
1553d5d284b8SSaeed Mahameed 		goto err_wq;
1554d5d284b8SSaeed Mahameed 	}
15551f8a7beeSYuval Avnery 	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
15561f8a7beeSYuval Avnery 	if (err) {
15571f8a7beeSYuval Avnery 		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
15581f8a7beeSYuval Avnery 		goto err_eq;
15591f8a7beeSYuval Avnery 	}
1560d5d284b8SSaeed Mahameed 
1561ad50294dSShay Drory 	mutex_unlock(&dev->odp_eq_mutex);
1562d5d284b8SSaeed Mahameed 	return 0;
15631f8a7beeSYuval Avnery err_eq:
15641f8a7beeSYuval Avnery 	mlx5_eq_destroy_generic(dev->mdev, eq->core);
1565d5d284b8SSaeed Mahameed err_wq:
1566ad50294dSShay Drory 	eq->core = NULL;
1567d5d284b8SSaeed Mahameed 	destroy_workqueue(eq->wq);
1568d5d284b8SSaeed Mahameed err_mempool:
1569d5d284b8SSaeed Mahameed 	mempool_destroy(eq->pool);
1570ad50294dSShay Drory unlock:
1571ad50294dSShay Drory 	mutex_unlock(&dev->odp_eq_mutex);
1572d5d284b8SSaeed Mahameed 	return err;
1573d5d284b8SSaeed Mahameed }
1574d5d284b8SSaeed Mahameed 
1575d5d284b8SSaeed Mahameed static int
1576ad50294dSShay Drory mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
1577d5d284b8SSaeed Mahameed {
1578d5d284b8SSaeed Mahameed 	int err;
1579d5d284b8SSaeed Mahameed 
1580ad50294dSShay Drory 	if (!eq->core)
1581ad50294dSShay Drory 		return 0;
15821f8a7beeSYuval Avnery 	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
1583d5d284b8SSaeed Mahameed 	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
1584d5d284b8SSaeed Mahameed 	cancel_work_sync(&eq->work);
1585d5d284b8SSaeed Mahameed 	destroy_workqueue(eq->wq);
1586d5d284b8SSaeed Mahameed 	mempool_destroy(eq->pool);
1587d5d284b8SSaeed Mahameed 
1588d5d284b8SSaeed Mahameed 	return err;
1589d5d284b8SSaeed Mahameed }
1590d5d284b8SSaeed Mahameed 
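/*
 * Pre-create the mkey cache entry used by implicit ODP MRs (KSM access
 * mode, mlx5_imr_ksm_entries descriptors).  A no-op when implicit ODP is
 * not supported.
 */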
159301137808SAharon Landau int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
159281713d37SArtemy Kovalyov {
159381713d37SArtemy Kovalyov 	struct mlx5r_cache_rb_key rb_key = {
159481713d37SArtemy Kovalyov 		.access_mode = MLX5_MKC_ACCESS_MODE_KSM,
159581713d37SArtemy Kovalyov 		.ndescs = mlx5_imr_ksm_entries,
159681713d37SArtemy Kovalyov 	};
159781713d37SArtemy Kovalyov 	struct mlx5_cache_ent *ent;
159881713d37SArtemy Kovalyov 
15999ee2516cSAharon Landau 	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
160081713d37SArtemy Kovalyov 		return 0;
160181713d37SArtemy Kovalyov 
160281713d37SArtemy Kovalyov 	ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
160381713d37SArtemy Kovalyov 	if (IS_ERR(ent))
160481713d37SArtemy Kovalyov 		return PTR_ERR(ent);
160581713d37SArtemy Kovalyov 
16069ee2516cSAharon Landau 	return 0;
160781713d37SArtemy Kovalyov }
160881713d37SArtemy Kovalyov 
160981713d37SArtemy Kovalyov static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
161081713d37SArtemy Kovalyov 	.advise_mr = mlx5_ib_advise_mr,
161181713d37SArtemy Kovalyov };
161281713d37SArtemy Kovalyov 
1613813e90b1SMoni Shoua int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
1614813e90b1SMoni Shoua {
1615813e90b1SMoni Shoua 	internal_fill_odp_caps(dev);
1616813e90b1SMoni Shoua 
161781713d37SArtemy Kovalyov 	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
16186aec21f6SHaggai Eran 		return 0;
1619e5dc370bSShay Drory 
1620e5dc370bSShay Drory 	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
162100815752SMoni Shoua 
1622*594cac11SOr Har-Toov 	mutex_init(&dev->odp_eq_mutex);
162300815752SMoni Shoua 	return 0;
1624813e90b1SMoni Shoua }
1625813e90b1SMoni Shoua 
1626ad50294dSShay Drory void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
1627*594cac11SOr Har-Toov {
1628d5d284b8SSaeed Mahameed 	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1629d5d284b8SSaeed Mahameed 		return;
1630d5d284b8SSaeed Mahameed 
1631d5d284b8SSaeed Mahameed 	mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
163200815752SMoni Shoua }
1633d5d284b8SSaeed Mahameed 
1634d5d284b8SSaeed Mahameed int mlx5_ib_odp_init(void)
1635ad50294dSShay Drory {
16366aec21f6SHaggai Eran 	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
16376aec21f6SHaggai Eran 				       MLX5_IMR_MTT_BITS);
163881713d37SArtemy Kovalyov 
163981713d37SArtemy Kovalyov 	return 0;
164081713d37SArtemy Kovalyov }
164181713d37SArtemy Kovalyov 
164281713d37SArtemy Kovalyov struct prefetch_mr_work {
164381713d37SArtemy Kovalyov 	struct work_struct work;
16446aec21f6SHaggai Eran 	u32 pf_flags;
1645813e90b1SMoni Shoua 	u32 num_sge;
1646813e90b1SMoni Shoua 	struct {
1647813e90b1SMoni Shoua 		u64 io_virt;
1648813e90b1SMoni Shoua 		struct mlx5_ib_mr *mr;
1649813e90b1SMoni Shoua 		size_t length;
1650fb985e27SJason Gunthorpe 	} frags[];
1651fb985e27SJason Gunthorpe };
1652fb985e27SJason Gunthorpe 
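/* Drop the mkey references taken by get_prefetchable_mr() and free the work. */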
1653fb985e27SJason Gunthorpe static void destroy_prefetch_work(struct prefetch_mr_work *work)
1654fb985e27SJason Gunthorpe {
1655813e90b1SMoni Shoua 	u32 i;
1656813e90b1SMoni Shoua 
1657fb985e27SJason Gunthorpe 	for (i = 0; i < work->num_sge; ++i)
1658a6bc3875SMoni Shoua 		mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
1659a6bc3875SMoni Shoua 
1660fb985e27SJason Gunthorpe 	kvfree(work);
1661fb985e27SJason Gunthorpe }
1662db72438cSYishai Hadas 
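/*
 * Resolve an lkey to a prefetchable MR under the odp_mkeys lock: the
 * mkey must be an MR belonging to the given PD, and write prefetch
 * requires a writable umem.  On success a usecount reference is taken
 * which the caller drops with mlx5r_deref_odp_mkey().
 */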
1663db72438cSYishai Hadas static struct mlx5_ib_mr *
1664fb985e27SJason Gunthorpe get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
1665fb985e27SJason Gunthorpe 		    u32 lkey)
1666fb985e27SJason Gunthorpe {
1667fb985e27SJason Gunthorpe 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1668fb985e27SJason Gunthorpe 	struct mlx5_ib_mr *mr = NULL;
1669fb985e27SJason Gunthorpe 	struct mlx5_ib_mkey *mmkey;
1670fb985e27SJason Gunthorpe 
1671fb985e27SJason Gunthorpe 	xa_lock(&dev->odp_mkeys);
1672db72438cSYishai Hadas 	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
16734123bfb0SAharon Landau 	if (!mmkey || mmkey->key != lkey) {
1674fb985e27SJason Gunthorpe 		mr = ERR_PTR(-ENOENT);
1675db72438cSYishai Hadas 		goto end;
1676806b101bSJason Gunthorpe 	}
167749b99314SJason Gunthorpe 	if (mmkey->type != MLX5_MKEY_MR) {
167849b99314SJason Gunthorpe 		mr = ERR_PTR(-EINVAL);
1679db72438cSYishai Hadas 		goto end;
168049b99314SJason Gunthorpe 	}
168149b99314SJason Gunthorpe 
168249b99314SJason Gunthorpe 	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
168349b99314SJason Gunthorpe 
168449b99314SJason Gunthorpe 	if (mr->ibmr.pd != pd) {
1685fb985e27SJason Gunthorpe 		mr = ERR_PTR(-EPERM);
1686fb985e27SJason Gunthorpe 		goto end;
1687fb985e27SJason Gunthorpe 	}
1688db72438cSYishai Hadas 
168949b99314SJason Gunthorpe 	/* prefetch with write-access must be supported by the MR */
1690db72438cSYishai Hadas 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1691db72438cSYishai Hadas 	    !mr->umem->writable) {
1692fb985e27SJason Gunthorpe 		mr = ERR_PTR(-EPERM);
1693fb985e27SJason Gunthorpe 		goto end;
1694fb985e27SJason Gunthorpe 	}
1695db72438cSYishai Hadas 
169649b99314SJason Gunthorpe 	refcount_inc(&mmkey->usecount);
1697db72438cSYishai Hadas end:
1698db72438cSYishai Hadas 	xa_unlock(&dev->odp_mkeys);
1699fb985e27SJason Gunthorpe 	return mr;
1700db72438cSYishai Hadas }
1701db72438cSYishai Hadas 
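/* Deferred prefetch: fault in each recorded range, then drop the MR references. */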
1702db72438cSYishai Hadas static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
1703fb985e27SJason Gunthorpe {
1704fb985e27SJason Gunthorpe 	struct prefetch_mr_work *work =
1705fb985e27SJason Gunthorpe 		container_of(w, struct prefetch_mr_work, work);
1706fb985e27SJason Gunthorpe 	u32 bytes_mapped = 0;
1707fb985e27SJason Gunthorpe 	int ret;
1708fb985e27SJason Gunthorpe 	u32 i;
1709fb985e27SJason Gunthorpe 
1710fb985e27SJason Gunthorpe 	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
1711d473f4dcSMaor Gottlieb 	WARN_ON(!work->num_sge);
1712fb985e27SJason Gunthorpe 	for (i = 0; i < work->num_sge; ++i) {
1713fb985e27SJason Gunthorpe 		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
1714d4d7f596SMaor Gottlieb 				   work->frags[i].length, &bytes_mapped,
1715d4d7f596SMaor Gottlieb 				   work->pf_flags);
1716d473f4dcSMaor Gottlieb 		if (ret <= 0)
1717d473f4dcSMaor Gottlieb 			continue;
1718fb985e27SJason Gunthorpe 		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
1719fb985e27SJason Gunthorpe 	}
1720d473f4dcSMaor Gottlieb 
1721d473f4dcSMaor Gottlieb 	destroy_prefetch_work(work);
1722d473f4dcSMaor Gottlieb }
1723d473f4dcSMaor Gottlieb 
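/*
 * Resolve and reference every SGE before the work is queued.  On
 * failure, work->num_sge is set to the number of MRs already referenced
 * so that destroy_prefetch_work() releases exactly those.
 */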
1724fb985e27SJason Gunthorpe static int init_prefetch_work(struct ib_pd *pd,
1725fb985e27SJason Gunthorpe 			       enum ib_uverbs_advise_mr_advice advice,
1726fb985e27SJason Gunthorpe 			       u32 pf_flags, struct prefetch_mr_work *work,
1727fb985e27SJason Gunthorpe 			       struct ib_sge *sg_list, u32 num_sge)
172849b99314SJason Gunthorpe {
1729fb985e27SJason Gunthorpe 	u32 i;
1730fb985e27SJason Gunthorpe 
1731fb985e27SJason Gunthorpe 	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
1732fb985e27SJason Gunthorpe 	work->pf_flags = pf_flags;
1733fb985e27SJason Gunthorpe 
1734fb985e27SJason Gunthorpe 	for (i = 0; i < num_sge; ++i) {
1735fb985e27SJason Gunthorpe 		struct mlx5_ib_mr *mr;
1736fb985e27SJason Gunthorpe 
1737fb985e27SJason Gunthorpe 		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1738fb985e27SJason Gunthorpe 		if (IS_ERR(mr)) {
173949b99314SJason Gunthorpe 			work->num_sge = i;
174049b99314SJason Gunthorpe 			return PTR_ERR(mr);
174149b99314SJason Gunthorpe 		}
174249b99314SJason Gunthorpe 		work->frags[i].io_virt = sg_list[i].addr;
174349b99314SJason Gunthorpe 		work->frags[i].length = sg_list[i].length;
174449b99314SJason Gunthorpe 		work->frags[i].mr = mr;
174549b99314SJason Gunthorpe 	}
1746fb985e27SJason Gunthorpe 	work->num_sge = num_sge;
1747fb985e27SJason Gunthorpe 	return 0;
174849b99314SJason Gunthorpe }
1749fb985e27SJason Gunthorpe 
1750fb985e27SJason Gunthorpe static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
175149b99314SJason Gunthorpe 				    enum ib_uverbs_advise_mr_advice advice,
1752fb985e27SJason Gunthorpe 				    u32 pf_flags, struct ib_sge *sg_list,
1753fb985e27SJason Gunthorpe 				    u32 num_sge)
1754fb985e27SJason Gunthorpe {
1755fb985e27SJason Gunthorpe 	u32 bytes_mapped = 0;
1756fb985e27SJason Gunthorpe 	int ret = 0;
1757fb985e27SJason Gunthorpe 	u32 i;
1758fb985e27SJason Gunthorpe 
1759fb985e27SJason Gunthorpe 	for (i = 0; i < num_sge; ++i) {
1760a6bc3875SMoni Shoua 		struct mlx5_ib_mr *mr;
1761fb985e27SJason Gunthorpe 
1762813e90b1SMoni Shoua 		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
1763813e90b1SMoni Shoua 		if (IS_ERR(mr))
17648cdd312cSHaggai Eran 			return PTR_ERR(mr);
1765813e90b1SMoni Shoua 		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
1766fb985e27SJason Gunthorpe 				   &bytes_mapped, pf_flags);
176749b99314SJason Gunthorpe 		if (ret < 0) {
176849b99314SJason Gunthorpe 			mlx5r_deref_odp_mkey(&mr->mmkey);
1769fb985e27SJason Gunthorpe 			return ret;
1770fb985e27SJason Gunthorpe 		}
1771db72438cSYishai Hadas 		mlx5_update_odp_stats(mr, prefetch, ret);
1772db72438cSYishai Hadas 		mlx5r_deref_odp_mkey(&mr->mmkey);
1773fb985e27SJason Gunthorpe 	}
1774813e90b1SMoni Shoua 
1775db72438cSYishai Hadas 	return 0;
1776db72438cSYishai Hadas }
1777db72438cSYishai Hadas 
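/*
 * Prefetch advice entry point.  FLUSH requests are served synchronously;
 * otherwise the SGEs are resolved and referenced here and the actual
 * faulting is deferred to system_unbound_wq.
 */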
1778db72438cSYishai Hadas int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
1779db72438cSYishai Hadas 			       enum ib_uverbs_advise_mr_advice advice,
1780db72438cSYishai Hadas 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
1781813e90b1SMoni Shoua {
1782813e90b1SMoni Shoua 	u32 pf_flags = 0;
1783813e90b1SMoni Shoua 	struct prefetch_mr_work *work;
1784813e90b1SMoni Shoua 	int rc;
1785813e90b1SMoni Shoua 
1786fb985e27SJason Gunthorpe 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
1787813e90b1SMoni Shoua 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
178849b99314SJason Gunthorpe 
1789813e90b1SMoni Shoua 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
1790813e90b1SMoni Shoua 		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
1791813e90b1SMoni Shoua 
1792813e90b1SMoni Shoua 	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
1793677cf51fSYishai Hadas 		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
1794677cf51fSYishai Hadas 						num_sge);
1795677cf51fSYishai Hadas 
1796813e90b1SMoni Shoua 	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
1797fb985e27SJason Gunthorpe 	if (!work)
1798813e90b1SMoni Shoua 		return -ENOMEM;
1799813e90b1SMoni Shoua 
1800fb985e27SJason Gunthorpe 	rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge);
1801813e90b1SMoni Shoua 	if (rc) {
1802813e90b1SMoni Shoua 		destroy_prefetch_work(work);
1803813e90b1SMoni Shoua 		return rc;
180449b99314SJason Gunthorpe 	}
180549b99314SJason Gunthorpe 	queue_work(system_unbound_wq, &work->work);
18065351a56bSJason Gunthorpe 	return 0;
180749b99314SJason Gunthorpe }
1808fb985e27SJason Gunthorpe