18cdd312cSHaggai Eran /*
26cf0a15fSSaeed Mahameed * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
38cdd312cSHaggai Eran *
48cdd312cSHaggai Eran * This software is available to you under a choice of one of two
58cdd312cSHaggai Eran * licenses. You may choose to be licensed under the terms of the GNU
68cdd312cSHaggai Eran * General Public License (GPL) Version 2, available from the file
78cdd312cSHaggai Eran * COPYING in the main directory of this source tree, or the
88cdd312cSHaggai Eran * OpenIB.org BSD license below:
98cdd312cSHaggai Eran *
108cdd312cSHaggai Eran * Redistribution and use in source and binary forms, with or
118cdd312cSHaggai Eran * without modification, are permitted provided that the following
128cdd312cSHaggai Eran * conditions are met:
138cdd312cSHaggai Eran *
148cdd312cSHaggai Eran * - Redistributions of source code must retain the above
158cdd312cSHaggai Eran * copyright notice, this list of conditions and the following
168cdd312cSHaggai Eran * disclaimer.
178cdd312cSHaggai Eran *
188cdd312cSHaggai Eran * - Redistributions in binary form must reproduce the above
198cdd312cSHaggai Eran * copyright notice, this list of conditions and the following
208cdd312cSHaggai Eran * disclaimer in the documentation and/or other materials
218cdd312cSHaggai Eran * provided with the distribution.
228cdd312cSHaggai Eran *
238cdd312cSHaggai Eran * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
248cdd312cSHaggai Eran * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
258cdd312cSHaggai Eran * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
268cdd312cSHaggai Eran * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
278cdd312cSHaggai Eran * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
288cdd312cSHaggai Eran * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
298cdd312cSHaggai Eran * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
308cdd312cSHaggai Eran * SOFTWARE.
318cdd312cSHaggai Eran */
328cdd312cSHaggai Eran
337bdf65d4SHaggai Eran #include <rdma/ib_umem_odp.h>
34e980b441SJérémy Lefaure #include <linux/kernel.h>
3590da7dc8SJianxin Xiong #include <linux/dma-buf.h>
3690da7dc8SJianxin Xiong #include <linux/dma-resv.h>
377bdf65d4SHaggai Eran
388cdd312cSHaggai Eran #include "mlx5_ib.h"
3981713d37SArtemy Kovalyov #include "cmd.h"
40f49c856aSAharon Landau #include "umr.h"
41333fbaa0SLeon Romanovsky #include "qp.h"
428cdd312cSHaggai Eran
43d5d284b8SSaeed Mahameed #include <linux/mlx5/eq.h>
44d5d284b8SSaeed Mahameed
/*
 * Contains the details of a pagefault, as decoded from the device's
 * page fault event, queued for deferred handling via @work.
 */
struct mlx5_pagefault {
	/* Bytes the device already committed before raising this fault */
	u32			bytes_committed;
	/* HW-provided token identifying the faulting context */
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	/* EQ this fault arrived on; see mlx5_ib_page_fault_resume() users */
	struct mlx5_ib_pf_eq	*eq;
	/* Deferred-handling hook (handlers not visible in this chunk) */
	struct work_struct	work;
};
82d5d284b8SSaeed Mahameed
/* Upper bound on a single ODP prefetch request: 4MiB */
#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

/* Each implicit-MR child MTT MR covers 2^30 bytes (1GiB) of VA space */
#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

/* Each top-level KSM entry maps one child MR, so it uses the child's shift */
#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

/* Number of KSM entries in an implicit MR; set elsewhere (not in this chunk) */
static u64 mlx5_imr_ksm_entries;
9881713d37SArtemy Kovalyov
/*
 * Fill @nentries KLM entries of an implicit MR's top-level table,
 * starting at entry @idx.  With MLX5_IB_UPD_XLT_ZAP every entry is
 * pointed at the device's null mkey; otherwise entries mirror the
 * implicit_children xarray: present children get their lkey and VA,
 * absent slots get the null mkey.
 */
static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *imr, int flags)
{
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (i = 0; i != nentries; i++) {
			pklm[i].bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm[i].key = mr_to_mdev(imr)->mkeys.null_mkey;
			pklm[i].va = 0;
		}
		return;
	}

	/*
	 * Subtle locking: ideally the implicit_children xarray would be
	 * protected by the umem_mutex, however that is not possible.
	 * Instead updates use a weaker update-then-lock pattern:
	 *
	 *  xa_store()
	 *  mutex_lock(umem_mutex)
	 *   mlx5r_umr_update_xlt()
	 *  mutex_unlock(umem_mutex)
	 *  destroy lkey
	 *
	 * i.e. any change to the xarray must be followed by the locked
	 * update_xlt before destroying.
	 *
	 * The umem_mutex provides the acquire/release semantics needed to
	 * make the xa_store() visible to a racing thread.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);

	for (i = 0; i != nentries; i++, idx++) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm[i].bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (!mtt) {
			pklm[i].key = mr_to_mdev(imr)->mkeys.null_mkey;
			pklm[i].va = 0;
		} else {
			pklm[i].key = cpu_to_be32(mtt->ibmr.lkey);
			pklm[i].va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		}
	}
}
14581713d37SArtemy Kovalyov
umem_dma_to_mtt(dma_addr_t umem_dma)146cbe4b8f0SArtemy Kovalyov static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
147cbe4b8f0SArtemy Kovalyov {
148cbe4b8f0SArtemy Kovalyov u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
149cbe4b8f0SArtemy Kovalyov
150cbe4b8f0SArtemy Kovalyov if (umem_dma & ODP_READ_ALLOWED_BIT)
151cbe4b8f0SArtemy Kovalyov mtt_entry |= MLX5_IB_MTT_READ;
152cbe4b8f0SArtemy Kovalyov if (umem_dma & ODP_WRITE_ALLOWED_BIT)
153cbe4b8f0SArtemy Kovalyov mtt_entry |= MLX5_IB_MTT_WRITE;
154cbe4b8f0SArtemy Kovalyov
155cbe4b8f0SArtemy Kovalyov return mtt_entry;
156cbe4b8f0SArtemy Kovalyov }
157cbe4b8f0SArtemy Kovalyov
/*
 * Fill @nentries MTT entries for @mr starting at page index @idx,
 * sourced from the umem's dma_list.  A ZAP request is a no-op here:
 * the caller-provided buffer is pre-zeroed for that case.
 */
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP)
		return;

	for (i = 0; i != nentries; i++)
		pas[i] = cpu_to_be64(umem_dma_to_mtt(odp->dma_list[idx + i]));
}
173cbe4b8f0SArtemy Kovalyov
/*
 * Populate an XLT buffer for an ODP MR: indirect (implicit) MRs get
 * KLM entries, regular ODP MRs get MTT entries.
 */
void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *mr, int flags)
{
	if (flags & MLX5_IB_UPD_XLT_INDIRECT)
		populate_klm(xlt, idx, nentries, mr, flags);
	else
		populate_mtt(xlt, idx, nentries, mr, flags);
}
183cbe4b8f0SArtemy Kovalyov
/*
 * This must be called after the mr has been removed from implicit_children.
 * NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Wait for all in-flight users of the child mkey to drain */
	mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	/*
	 * Point the parent's KLM entry for this child back at the null
	 * mkey (locked update, per the pattern documented in this file)
	 * before the child's lkey is destroyed.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	mlx5r_umr_update_xlt(mr->parent,
			     ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
			     MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);

	/* Drop the parent reference taken when the destroy was queued */
	mlx5r_deref_odp_mkey(&imr->mmkey);
}
2095256edcbSJason Gunthorpe
/*
 * Detach an empty child MR from its parent implicit MR and queue its
 * destruction.  Caller context cannot sleep, so the actual teardown is
 * deferred to free_implicit_child_mr_work().
 */
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	/*
	 * Pin the parent for the duration of the deferred destroy; if its
	 * usecount already hit zero the parent is going away and will
	 * clean up the children itself.
	 */
	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
		return;

	xa_erase(&imr->implicit_children, idx);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}
22581713d37SArtemy Kovalyov
/*
 * MMU interval notifier callback: zap the HW translations covering the
 * invalidated VA range before the core unmaps the pages, so the device
 * can no longer DMA to them.  Runs under umem_odp->umem_mutex, which
 * serializes it against page fault handlers touching the same MTTs.
 */
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	/* This handler must sleep; refuse non-blockable invalidations */
	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;

	/* Clamp the invalidated range to the part this umem covers */
	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			/* Flush a finished chunk on a UMR-aligned boundary */
			if (in_block && umr_offset == 0) {
				mlx5r_umr_update_xlt(mr, blk_start_idx,
						     idx - blk_start_idx, 0,
						     MLX5_IB_UPD_XLT_ZAP |
						     MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	/* Flush the trailing open chunk, if any */
	if (in_block)
		mlx5r_umr_update_xlt(mr, blk_start_idx,
				     idx - blk_start_idx + 1, 0,
				     MLX5_IB_UPD_XLT_ZAP |
				     MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	/* A child MR that became fully empty can be reclaimed now */
	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}
314b4cfe447SHaggai Eran
/*
 * Notifier ops handed to the core ODP code (e.g. via
 * ib_umem_odp_alloc_child() below) so range invalidations reach us.
 */
const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};
318f25a546eSJason Gunthorpe
/*
 * Populate dev->odp_caps from the device's FW capability bits.  Caps
 * stay all-zero (ODP unsupported) unless paging is supported and UMR
 * can load PAS; implicit ODP additionally requires the fixed-buffer,
 * null-mkey and indirect-UMR capabilities.
 */
static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	/* Max ODP MR size is bounded by what a single UMR can translate */
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	/* Per-transport capability bits: UD, then RC, then XRC */
	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
3836aec21f6SHaggai Eran
/*
 * Tell HW that the page fault described by @pfault has been handled,
 * via the PAGE_FAULT_RESUME command.  A non-zero @error asks the
 * device to complete the faulting request in error instead.
 */
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	/* WQE faults are addressed by WQ number, other faults by token */
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}
4046aec21f6SHaggai Eran
/*
 * Create (or find, if we lose a creation race) the child MTT MR of
 * implicit MR @imr covering KLM slot @idx.  On success the returned MR
 * carries a reference for the caller; a second reference is owned by
 * the parent's implicit_children xarray.  Returns ERR_PTR on failure.
 */
static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	/* Child umem covers one MLX5_IMR_MTT_SIZE slice of the parent */
	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
				 MLX5_MKC_ACCESS_MODE_MTT,
				 MLX5_IMR_MTT_ENTRIES);
	if (IS_ERR(mr)) {
		ib_umem_odp_release(odp);
		return mr;
	}

	/* The child inherits the parent's PD/access and its slot's IOVA */
	mr->access_flags = imr->access_flags;
	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
	mr->parent = imr;
	odp->private = mr;

	/*
	 * First refcount is owned by the xarray and second refcount
	 * is returned to the caller.
	 */
	refcount_set(&mr->mmkey.usecount, 2);

	/* Enable the child mkey with an all-zapped translation table */
	err = mlx5r_umr_update_xlt(mr, 0,
				   MLX5_IMR_MTT_ENTRIES,
				   PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	/* Publish atomically; only one thread may install a child at @idx */
	xa_lock(&imr->implicit_children);
	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			   GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_lock;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		refcount_inc(&ret->mmkey.usecount);
		goto out_lock;
	}
	xa_unlock(&imr->implicit_children);

	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_lock:
	xa_unlock(&imr->implicit_children);
out_mr:
	/* Destroy our child; the umem is released as part of dereg */
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ret;
}
48081713d37SArtemy Kovalyov
/*
 * Create an implicit ODP MR: a KSM-indirect mkey whose entries point
 * at on-demand child MTT MRs (see implicit_get_child_mr()).  Returns
 * the new MR or an ERR_PTR.
 */
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	/* UMR must be able to load a whole child's worth of PAS */
	if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
		return ERR_PTR(-EOPNOTSUPP);

	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
				  mlx5_imr_ksm_entries);
	if (IS_ERR(imr)) {
		ib_umem_odp_release(umem_odp);
		return imr;
	}

	imr->access_flags = access_flags;
	imr->ibmr.pd = &pd->ibpd;
	imr->ibmr.iova = 0;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->ibmr.device = &dev->ib_dev;
	imr->is_odp_implicit = true;
	xa_init(&imr->implicit_children);

	/* Zap and enable the top-level KSM table before exposing the mkey */
	err = mlx5r_umr_update_xlt(imr, 0,
				   mlx5_imr_ksm_entries,
				   MLX5_KSM_PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	/* Make the mkey findable by the page fault machinery */
	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
	return ERR_PTR(err);
}
53381713d37SArtemy Kovalyov
mlx5_ib_free_odp_mr(struct mlx5_ib_mr * mr)534e6fb246cSJason Gunthorpe void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
53581713d37SArtemy Kovalyov {
536423f52d6SJason Gunthorpe struct mlx5_ib_mr *mtt;
537423f52d6SJason Gunthorpe unsigned long idx;
538f993de88SJason Gunthorpe
5395256edcbSJason Gunthorpe /*
540e6fb246cSJason Gunthorpe * If this is an implicit MR it is already invalidated so we can just
541e6fb246cSJason Gunthorpe * delete the children mkeys.
542a862192eSJason Gunthorpe */
543e6fb246cSJason Gunthorpe xa_for_each(&mr->implicit_children, idx, mtt) {
544e6fb246cSJason Gunthorpe xa_erase(&mr->implicit_children, idx);
545e6fb246cSJason Gunthorpe mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
54690da7dc8SJianxin Xiong }
54790da7dc8SJianxin Xiong }
54890da7dc8SJianxin Xiong
549813e90b1SMoni Shoua #define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
550677cf51fSYishai Hadas #define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
551a03bfc37SYishai Hadas #define MLX5_PF_FLAGS_ENABLE BIT(3)
/*
 * Fault in [user_va, user_va + bcnt) of a non-implicit ODP MR and push
 * the resulting translations to HW.  ib_umem_odp_map_dma_and_lock()
 * returns with umem_mutex held on success; it is released here after
 * the XLT update.  Returns the number of system pages mapped, or a
 * negative errno.  If @bytes_mapped is non-NULL it is incremented by
 * the bytes of @bcnt actually covered.
 */
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask;
	u64 start_idx;
	/* SNAPSHOT means: record current mappings only, don't fault pages */
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	/* DOWNGRADE forces read-only mapping even on a writable umem */
	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/*
	 * No need to check whether the MTTs really belong to this MR, since
	 * ib_umem_odp_map_dma_and_lock already checks this.
	 */
	ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr_to_mdev(mr),
				    "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		/* Bytes mapped from user_va to the end of the last MR page */
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	/* Convert MR pages to system pages for the return value */
	return np << (page_shift - PAGE_SHIFT);

out:
	return ret;
}
6031b7dbc26SArtemy Kovalyov
/*
 * Resolve a page fault on an implicit (on-demand) parent MR by faulting each
 * child MTT MR intersecting [user_va, user_va + bcnt).  Missing children are
 * created on the fly; any creation forces an indirect (KSM) XLT update before
 * returning so HW stays synchronized with implicit_children.
 *
 * Returns the total number of pages mapped on success or a negative errno.
 */
pagefault_implicit_mr(struct mlx5_ib_mr * imr,struct ib_umem_odp * odp_imr,u64 user_va,size_t bcnt,u32 * bytes_mapped,u32 flags)604b70d785dSJason Gunthorpe static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
605b70d785dSJason Gunthorpe struct ib_umem_odp *odp_imr, u64 user_va,
606b70d785dSJason Gunthorpe size_t bcnt, u32 *bytes_mapped, u32 flags)
607b70d785dSJason Gunthorpe {
608b70d785dSJason Gunthorpe unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
609b70d785dSJason Gunthorpe unsigned long upd_start_idx = end_idx + 1;
610b70d785dSJason Gunthorpe unsigned long upd_len = 0;
611b70d785dSJason Gunthorpe unsigned long npages = 0;
612b70d785dSJason Gunthorpe int err;
613b70d785dSJason Gunthorpe int ret;
614b70d785dSJason Gunthorpe
/* Reject ranges extending beyond the implicit MR's addressable span. */
615b70d785dSJason Gunthorpe if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
616b70d785dSJason Gunthorpe mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
617b70d785dSJason Gunthorpe return -EFAULT;
618b70d785dSJason Gunthorpe
619b70d785dSJason Gunthorpe /* Fault each child mr that intersects with our interval. */
620b70d785dSJason Gunthorpe while (bcnt) {
621b70d785dSJason Gunthorpe unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
622b70d785dSJason Gunthorpe struct ib_umem_odp *umem_odp;
623b70d785dSJason Gunthorpe struct mlx5_ib_mr *mtt;
624b70d785dSJason Gunthorpe u64 len;
625b70d785dSJason Gunthorpe
/*
 * Look up the child under the xarray lock and pin it with a usecount
 * reference so it cannot be destroyed while we fault it.
 */
626db72438cSYishai Hadas xa_lock(&imr->implicit_children);
627b70d785dSJason Gunthorpe mtt = xa_load(&imr->implicit_children, idx);
628b70d785dSJason Gunthorpe if (unlikely(!mtt)) {
629db72438cSYishai Hadas xa_unlock(&imr->implicit_children);
630b70d785dSJason Gunthorpe mtt = implicit_get_child_mr(imr, idx);
631b70d785dSJason Gunthorpe if (IS_ERR(mtt)) {
632b70d785dSJason Gunthorpe ret = PTR_ERR(mtt);
633b70d785dSJason Gunthorpe goto out;
634b70d785dSJason Gunthorpe }
/* Track the index range of newly created children needing a KSM update. */
635b70d785dSJason Gunthorpe upd_start_idx = min(upd_start_idx, idx);
636b70d785dSJason Gunthorpe upd_len = idx - upd_start_idx + 1;
637db72438cSYishai Hadas } else {
638db72438cSYishai Hadas refcount_inc(&mtt->mmkey.usecount);
639db72438cSYishai Hadas xa_unlock(&imr->implicit_children);
640b70d785dSJason Gunthorpe }
641b70d785dSJason Gunthorpe
/* Clamp the fault length to the end of this child's umem. */
642b70d785dSJason Gunthorpe umem_odp = to_ib_umem_odp(mtt->umem);
643b70d785dSJason Gunthorpe len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
644b70d785dSJason Gunthorpe user_va;
645b70d785dSJason Gunthorpe
646b70d785dSJason Gunthorpe ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
647b70d785dSJason Gunthorpe bytes_mapped, flags);
648db72438cSYishai Hadas
649db72438cSYishai Hadas mlx5r_deref_odp_mkey(&mtt->mmkey);
650db72438cSYishai Hadas
651b70d785dSJason Gunthorpe if (ret < 0)
652b70d785dSJason Gunthorpe goto out;
653b70d785dSJason Gunthorpe user_va += len;
654b70d785dSJason Gunthorpe bcnt -= len;
655b70d785dSJason Gunthorpe npages += ret;
656b70d785dSJason Gunthorpe }
657b70d785dSJason Gunthorpe
658b70d785dSJason Gunthorpe ret = npages;
659b70d785dSJason Gunthorpe
660b70d785dSJason Gunthorpe /*
661b70d785dSJason Gunthorpe * Any time the implicit_children are changed we must perform an
662b70d785dSJason Gunthorpe * update of the xlt before exiting to ensure the HW and the
663b70d785dSJason Gunthorpe * implicit_children remains synchronized.
664b70d785dSJason Gunthorpe */
665b70d785dSJason Gunthorpe out:
666b70d785dSJason Gunthorpe if (likely(!upd_len))
667b70d785dSJason Gunthorpe return ret;
668b70d785dSJason Gunthorpe
669b70d785dSJason Gunthorpe /*
670b70d785dSJason Gunthorpe * Notice this is not strictly ordered right, the KSM is updated after
671b70d785dSJason Gunthorpe * the implicit_children is updated, so a parallel page fault could
672b70d785dSJason Gunthorpe * see a MR that is not yet visible in the KSM. This is similar to a
673b70d785dSJason Gunthorpe * parallel page fault seeing a MR that is being concurrently removed
674b70d785dSJason Gunthorpe * from the KSM. Both of these improbable situations are resolved
675b70d785dSJason Gunthorpe * safely by resuming the HW and then taking another page fault. The
676b70d785dSJason Gunthorpe * next pagefault handler will see the new information.
677b70d785dSJason Gunthorpe */
678b70d785dSJason Gunthorpe mutex_lock(&odp_imr->umem_mutex);
679636bdbfcSAharon Landau err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0,
680b70d785dSJason Gunthorpe MLX5_IB_UPD_XLT_INDIRECT |
681b70d785dSJason Gunthorpe MLX5_IB_UPD_XLT_ATOMIC);
682b70d785dSJason Gunthorpe mutex_unlock(&odp_imr->umem_mutex);
683b70d785dSJason Gunthorpe if (err) {
684ca991a7dSMaor Gottlieb mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
685b70d785dSJason Gunthorpe return err;
686b70d785dSJason Gunthorpe }
687b70d785dSJason Gunthorpe return ret;
688b70d785dSJason Gunthorpe }
689b70d785dSJason Gunthorpe
/*
 * Resolve a page fault on a dmabuf-backed MR.  The dmabuf pages are mapped
 * under the reservation (dma_resv) lock and the full mapping is pushed to HW;
 * bcnt only affects the bytes_mapped report, the whole MR is always mapped.
 *
 * Returns the number of pages in the umem on success or a negative errno.
 */
pagefault_dmabuf_mr(struct mlx5_ib_mr * mr,size_t bcnt,u32 * bytes_mapped,u32 flags)69090da7dc8SJianxin Xiong static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
69190da7dc8SJianxin Xiong u32 *bytes_mapped, u32 flags)
69290da7dc8SJianxin Xiong {
69390da7dc8SJianxin Xiong struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
69490da7dc8SJianxin Xiong u32 xlt_flags = 0;
69590da7dc8SJianxin Xiong int err;
69690da7dc8SJianxin Xiong unsigned int page_size;
69790da7dc8SJianxin Xiong
69890da7dc8SJianxin Xiong if (flags & MLX5_PF_FLAGS_ENABLE)
69990da7dc8SJianxin Xiong xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
70090da7dc8SJianxin Xiong
/* The resv lock is held across mapping, page-size check and XLT update. */
70190da7dc8SJianxin Xiong dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
70290da7dc8SJianxin Xiong err = ib_umem_dmabuf_map_pages(umem_dmabuf);
70390da7dc8SJianxin Xiong if (err) {
70490da7dc8SJianxin Xiong dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
70590da7dc8SJianxin Xiong return err;
70690da7dc8SJianxin Xiong }
70790da7dc8SJianxin Xiong
70890da7dc8SJianxin Xiong page_size = mlx5_umem_find_best_pgsz(&umem_dmabuf->umem, mkc,
70990da7dc8SJianxin Xiong log_page_size, 0,
71090da7dc8SJianxin Xiong umem_dmabuf->umem.iova);
/*
 * page_size < PAGE_SIZE presumably also covers a 0 "no fit" result from
 * mlx5_umem_find_best_pgsz -- NOTE(review): confirm against that helper.
 */
71190da7dc8SJianxin Xiong if (unlikely(page_size < PAGE_SIZE)) {
71290da7dc8SJianxin Xiong ib_umem_dmabuf_unmap_pages(umem_dmabuf);
71390da7dc8SJianxin Xiong err = -EINVAL;
71490da7dc8SJianxin Xiong } else {
715b3d47ebdSAharon Landau err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
71690da7dc8SJianxin Xiong }
71790da7dc8SJianxin Xiong dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
71890da7dc8SJianxin Xiong
71990da7dc8SJianxin Xiong if (err)
72090da7dc8SJianxin Xiong return err;
72190da7dc8SJianxin Xiong
72290da7dc8SJianxin Xiong if (bytes_mapped)
72390da7dc8SJianxin Xiong *bytes_mapped += bcnt;
72490da7dc8SJianxin Xiong
72590da7dc8SJianxin Xiong return ib_umem_num_pages(mr->umem);
72690da7dc8SJianxin Xiong }
72790da7dc8SJianxin Xiong
72854375e73SJason Gunthorpe /*
72954375e73SJason Gunthorpe * Returns:
73054375e73SJason Gunthorpe * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
73154375e73SJason Gunthorpe * not accessible, or the MR is no longer valid.
73254375e73SJason Gunthorpe * -EAGAIN/-ENOMEM: The operation should be retried
73354375e73SJason Gunthorpe *
73454375e73SJason Gunthorpe * -EINVAL/others: General internal malfunction
73554375e73SJason Gunthorpe * >0: Number of pages mapped
73654375e73SJason Gunthorpe */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	u64 user_va;

	/* The fault must start inside the MR's IOVA range. */
	if (unlikely(io_virt < mr->ibmr.iova))
		return -EFAULT;

	/* dmabuf MRs always map the whole buffer. */
	if (mr->umem->is_dmabuf)
		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);

	/* Implicit ODP: delegate to the per-child fault logic. */
	if (odp->is_implicit_odp)
		return pagefault_implicit_mr(mr, odp, io_virt, bcnt,
					     bytes_mapped, flags);

	/*
	 * Direct ODP: translate the fault IOVA into a CPU virtual address
	 * and check it lies fully inside the umem.
	 */
	if (check_add_overflow(io_virt - mr->ibmr.iova,
			       (u64)odp->umem.address, &user_va))
		return -EFAULT;
	if (unlikely(user_va >= ib_umem_end(odp) ||
		     ib_umem_end(odp) - user_va < bcnt))
		return -EFAULT;
	return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags);
}
76354375e73SJason Gunthorpe
mlx5_ib_init_odp_mr(struct mlx5_ib_mr * mr)76438f8ff5bSJason Gunthorpe int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
765a03bfc37SYishai Hadas {
766a03bfc37SYishai Hadas int ret;
767a03bfc37SYishai Hadas
76838f8ff5bSJason Gunthorpe ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
76938f8ff5bSJason Gunthorpe mr->umem->length, NULL,
77038f8ff5bSJason Gunthorpe MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
771a03bfc37SYishai Hadas return ret >= 0 ? 0 : ret;
772a03bfc37SYishai Hadas }
773a03bfc37SYishai Hadas
mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr * mr)77490da7dc8SJianxin Xiong int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
77590da7dc8SJianxin Xiong {
77690da7dc8SJianxin Xiong int ret;
77790da7dc8SJianxin Xiong
77890da7dc8SJianxin Xiong ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
77990da7dc8SJianxin Xiong MLX5_PF_FLAGS_ENABLE);
78090da7dc8SJianxin Xiong
78190da7dc8SJianxin Xiong return ret >= 0 ? 0 : ret;
78290da7dc8SJianxin Xiong }
78390da7dc8SJianxin Xiong
/*
 * Saved state for one level of indirect-mkey traversal in
 * pagefault_single_data_segment(): a singly linked stack of pf_frames
 * replaces recursion when walking KLM entries of indirect mkeys.
 */
784db570d7dSArtemy Kovalyov struct pf_frame {
785db570d7dSArtemy Kovalyov struct pf_frame *next;
786db570d7dSArtemy Kovalyov u32 key;
787db570d7dSArtemy Kovalyov u64 io_virt;
788db570d7dSArtemy Kovalyov size_t bcnt;
/* indirection depth, bounded by MLX5_CAP_GEN(max_indirection) */
789db570d7dSArtemy Kovalyov int depth;
790db570d7dSArtemy Kovalyov };
791db570d7dSArtemy Kovalyov
mkey_is_eq(struct mlx5_ib_mkey * mmkey,u32 key)7924123bfb0SAharon Landau static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
793d623dfd2SArtemy Kovalyov {
794d623dfd2SArtemy Kovalyov if (!mmkey)
795d623dfd2SArtemy Kovalyov return false;
79613ad1125SAharon Landau if (mmkey->type == MLX5_MKEY_MW ||
79713ad1125SAharon Landau mmkey->type == MLX5_MKEY_INDIRECT_DEVX)
798d623dfd2SArtemy Kovalyov return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
799d623dfd2SArtemy Kovalyov return mmkey->key == key;
800d623dfd2SArtemy Kovalyov }
801d623dfd2SArtemy Kovalyov
8027bdf65d4SHaggai Eran /*
803d9aaed83SArtemy Kovalyov * Handle a single data segment in a page-fault WQE or RDMA region.
8047bdf65d4SHaggai Eran *
805b2ac9188SArtemy Kovalyov * Returns number of OS pages retrieved on success. The caller may continue to
8067bdf65d4SHaggai Eran * the next data segment.
8077bdf65d4SHaggai Eran * Can return the following error codes:
8087bdf65d4SHaggai Eran * -EAGAIN to designate a temporary error. The caller will abort handling the
8097bdf65d4SHaggai Eran * page fault and resolve it.
8107bdf65d4SHaggai Eran * -EFAULT when there's an error mapping the requested pages. The caller will
811d9aaed83SArtemy Kovalyov * abort the page fault handling.
8127bdf65d4SHaggai Eran */
pagefault_single_data_segment(struct mlx5_ib_dev * dev,struct ib_pd * pd,u32 key,u64 io_virt,size_t bcnt,u32 * bytes_committed,u32 * bytes_mapped)81381dd4c4bSMoni Shoua static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
81481dd4c4bSMoni Shoua struct ib_pd *pd, u32 key,
815813e90b1SMoni Shoua u64 io_virt, size_t bcnt,
816d9aaed83SArtemy Kovalyov u32 *bytes_committed,
817fb985e27SJason Gunthorpe u32 *bytes_mapped)
8187bdf65d4SHaggai Eran {
819db72438cSYishai Hadas int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
820db570d7dSArtemy Kovalyov struct pf_frame *head = NULL, *frame;
8214123bfb0SAharon Landau struct mlx5_ib_mkey *mmkey;
8227bdf65d4SHaggai Eran struct mlx5_ib_mr *mr;
823db570d7dSArtemy Kovalyov struct mlx5_klm *pklm;
824db570d7dSArtemy Kovalyov u32 *out = NULL;
825db570d7dSArtemy Kovalyov size_t offset;
8267bdf65d4SHaggai Eran
/* Skip the already-committed prefix of this segment. */
827d9aaed83SArtemy Kovalyov io_virt += *bytes_committed;
828d9aaed83SArtemy Kovalyov bcnt -= *bytes_committed;
8297bdf65d4SHaggai Eran
/*
 * Iteratively resolve the segment: an indirect mkey pushes one pf_frame per
 * overlapped KLM entry, then loops back here for the next queued mkey.
 */
830db570d7dSArtemy Kovalyov next_mr:
831db72438cSYishai Hadas xa_lock(&dev->odp_mkeys);
832806b101bSJason Gunthorpe mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
833806b101bSJason Gunthorpe if (!mmkey) {
834db72438cSYishai Hadas xa_unlock(&dev->odp_mkeys);
835806b101bSJason Gunthorpe mlx5_ib_dbg(
836806b101bSJason Gunthorpe dev,
837806b101bSJason Gunthorpe "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
838806b101bSJason Gunthorpe key);
839806b101bSJason Gunthorpe if (bytes_mapped)
840806b101bSJason Gunthorpe *bytes_mapped += bcnt;
841806b101bSJason Gunthorpe /*
842806b101bSJason Gunthorpe * The user could specify a SGL with multiple lkeys and only
843806b101bSJason Gunthorpe * some of them are ODP. Treat the non-ODP ones as fully
844806b101bSJason Gunthorpe * faulted.
845806b101bSJason Gunthorpe */
846806b101bSJason Gunthorpe ret = 0;
847db72438cSYishai Hadas goto end;
848806b101bSJason Gunthorpe }
/* Pin the mkey so it survives until we are done with it (dropped at end:). */
849db72438cSYishai Hadas refcount_inc(&mmkey->usecount);
850db72438cSYishai Hadas xa_unlock(&dev->odp_mkeys);
851db72438cSYishai Hadas
852d623dfd2SArtemy Kovalyov if (!mkey_is_eq(mmkey, key)) {
853db570d7dSArtemy Kovalyov mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
854db570d7dSArtemy Kovalyov ret = -EFAULT;
855db72438cSYishai Hadas goto end;
856db570d7dSArtemy Kovalyov }
857db570d7dSArtemy Kovalyov
858db570d7dSArtemy Kovalyov switch (mmkey->type) {
859db570d7dSArtemy Kovalyov case MLX5_MKEY_MR:
860db570d7dSArtemy Kovalyov mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
861db570d7dSArtemy Kovalyov
862fb985e27SJason Gunthorpe ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
863db570d7dSArtemy Kovalyov if (ret < 0)
864db72438cSYishai Hadas goto end;
865db570d7dSArtemy Kovalyov
866a3de94e3SErez Alfasi mlx5_update_odp_stats(mr, faults, ret);
867a3de94e3SErez Alfasi
868db570d7dSArtemy Kovalyov npages += ret;
869db570d7dSArtemy Kovalyov ret = 0;
870db570d7dSArtemy Kovalyov break;
871db570d7dSArtemy Kovalyov
/*
 * Indirect mkey: query its KLM list from FW and queue one frame for each
 * entry overlapped by [io_virt, io_virt + bcnt).
 */
872db570d7dSArtemy Kovalyov case MLX5_MKEY_MW:
873414556afSYishai Hadas case MLX5_MKEY_INDIRECT_DEVX:
874db570d7dSArtemy Kovalyov if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
875db570d7dSArtemy Kovalyov mlx5_ib_dbg(dev, "indirection level exceeded\n");
876db570d7dSArtemy Kovalyov ret = -EFAULT;
877db72438cSYishai Hadas goto end;
878db570d7dSArtemy Kovalyov }
879db570d7dSArtemy Kovalyov
880db570d7dSArtemy Kovalyov outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
881ae0579acSAharon Landau sizeof(*pklm) * (mmkey->ndescs - 2);
882db570d7dSArtemy Kovalyov
/* Grow the query buffer only when a larger KLM list is encountered. */
883db570d7dSArtemy Kovalyov if (outlen > cur_outlen) {
884db570d7dSArtemy Kovalyov kfree(out);
885db570d7dSArtemy Kovalyov out = kzalloc(outlen, GFP_KERNEL);
886db570d7dSArtemy Kovalyov if (!out) {
887db570d7dSArtemy Kovalyov ret = -ENOMEM;
888db72438cSYishai Hadas goto end;
889db570d7dSArtemy Kovalyov }
890db570d7dSArtemy Kovalyov cur_outlen = outlen;
891db570d7dSArtemy Kovalyov }
892db570d7dSArtemy Kovalyov
893db570d7dSArtemy Kovalyov pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
894db570d7dSArtemy Kovalyov bsf0_klm0_pas_mtt0_1);
895db570d7dSArtemy Kovalyov
89683fec3f1SAharon Landau ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen);
897db570d7dSArtemy Kovalyov if (ret)
898db72438cSYishai Hadas goto end;
899db570d7dSArtemy Kovalyov
900db570d7dSArtemy Kovalyov offset = io_virt - MLX5_GET64(query_mkey_out, out,
901db570d7dSArtemy Kovalyov memory_key_mkey_entry.start_addr);
902db570d7dSArtemy Kovalyov
903ae0579acSAharon Landau for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) {
/* Skip KLM entries entirely before the fault range. */
904db570d7dSArtemy Kovalyov if (offset >= be32_to_cpu(pklm->bcount)) {
905db570d7dSArtemy Kovalyov offset -= be32_to_cpu(pklm->bcount);
906db570d7dSArtemy Kovalyov continue;
907db570d7dSArtemy Kovalyov }
908db570d7dSArtemy Kovalyov
909db570d7dSArtemy Kovalyov frame = kzalloc(sizeof(*frame), GFP_KERNEL);
910db570d7dSArtemy Kovalyov if (!frame) {
911db570d7dSArtemy Kovalyov ret = -ENOMEM;
912db72438cSYishai Hadas goto end;
913db570d7dSArtemy Kovalyov }
914db570d7dSArtemy Kovalyov
915db570d7dSArtemy Kovalyov frame->key = be32_to_cpu(pklm->key);
916db570d7dSArtemy Kovalyov frame->io_virt = be64_to_cpu(pklm->va) + offset;
917db570d7dSArtemy Kovalyov frame->bcnt = min_t(size_t, bcnt,
918db570d7dSArtemy Kovalyov be32_to_cpu(pklm->bcount) - offset);
919db570d7dSArtemy Kovalyov frame->depth = depth + 1;
920db570d7dSArtemy Kovalyov frame->next = head;
921db570d7dSArtemy Kovalyov head = frame;
922db570d7dSArtemy Kovalyov
923db570d7dSArtemy Kovalyov bcnt -= frame->bcnt;
/* Only the first overlapped entry carries a partial offset. */
92475b7b86bSArtemy Kovalyov offset = 0;
925db570d7dSArtemy Kovalyov }
926db570d7dSArtemy Kovalyov break;
927db570d7dSArtemy Kovalyov
928db570d7dSArtemy Kovalyov default:
929db570d7dSArtemy Kovalyov mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
930db570d7dSArtemy Kovalyov ret = -EFAULT;
931db72438cSYishai Hadas goto end;
932db570d7dSArtemy Kovalyov }
933db570d7dSArtemy Kovalyov
/* Pop the next queued frame and resolve its mkey. */
934db570d7dSArtemy Kovalyov if (head) {
935db570d7dSArtemy Kovalyov frame = head;
936db570d7dSArtemy Kovalyov head = frame->next;
937db570d7dSArtemy Kovalyov
938db570d7dSArtemy Kovalyov key = frame->key;
939db570d7dSArtemy Kovalyov io_virt = frame->io_virt;
940db570d7dSArtemy Kovalyov bcnt = frame->bcnt;
941db570d7dSArtemy Kovalyov depth = frame->depth;
942db570d7dSArtemy Kovalyov kfree(frame);
943db570d7dSArtemy Kovalyov
944db72438cSYishai Hadas mlx5r_deref_odp_mkey(mmkey);
945db570d7dSArtemy Kovalyov goto next_mr;
946db570d7dSArtemy Kovalyov }
9477bdf65d4SHaggai Eran
/* Drop the current mkey reference and free any still-queued frames. */
948db72438cSYishai Hadas end:
949db72438cSYishai Hadas if (mmkey)
950db72438cSYishai Hadas mlx5r_deref_odp_mkey(mmkey);
951db570d7dSArtemy Kovalyov while (head) {
952db570d7dSArtemy Kovalyov frame = head;
953db570d7dSArtemy Kovalyov head = frame->next;
954db570d7dSArtemy Kovalyov kfree(frame);
955db570d7dSArtemy Kovalyov }
956db570d7dSArtemy Kovalyov kfree(out);
957db570d7dSArtemy Kovalyov
958d9aaed83SArtemy Kovalyov *bytes_committed = 0;
9597bdf65d4SHaggai Eran return ret ? ret : npages;
9607bdf65d4SHaggai Eran }
9617bdf65d4SHaggai Eran
962f9180399SLeon Romanovsky /*
9637bdf65d4SHaggai Eran * Parse a series of data segments for page fault handling.
9647bdf65d4SHaggai Eran *
9655e769e44SLee Jones * @dev: Pointer to mlx5 IB device
9665e769e44SLee Jones * @pfault: contains page fault information.
9675e769e44SLee Jones * @wqe: points at the first data segment in the WQE.
9685e769e44SLee Jones * @wqe_end: points after the end of the WQE.
9695e769e44SLee Jones * @bytes_mapped: receives the number of bytes that the function was able to
9707bdf65d4SHaggai Eran * map. This allows the caller to decide intelligently whether
9717bdf65d4SHaggai Eran * enough memory was mapped to resolve the page fault
9727bdf65d4SHaggai Eran * successfully (e.g. enough for the next MTU, or the entire
9737bdf65d4SHaggai Eran * WQE).
9745e769e44SLee Jones * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
9757bdf65d4SHaggai Eran * the committed bytes).
9765e769e44SLee Jones  * @receive_queue: true when parsing a receive-queue WQE, whose scatter list may be cut short by an end-of-sg-list terminator entry
9777bdf65d4SHaggai Eran *
9787bdf65d4SHaggai Eran * Returns the number of pages loaded if positive, zero for an empty WQE, or a
9797bdf65d4SHaggai Eran * negative error code.
9807bdf65d4SHaggai Eran */
pagefault_data_segments(struct mlx5_ib_dev * dev,struct mlx5_pagefault * pfault,void * wqe,void * wqe_end,u32 * bytes_mapped,u32 * total_wqe_bytes,bool receive_queue)981d9aaed83SArtemy Kovalyov static int pagefault_data_segments(struct mlx5_ib_dev *dev,
982d9aaed83SArtemy Kovalyov struct mlx5_pagefault *pfault,
983586f4e95SMoni Shoua void *wqe,
9847bdf65d4SHaggai Eran void *wqe_end, u32 *bytes_mapped,
9850f51427bSLeon Romanovsky u32 *total_wqe_bytes, bool receive_queue)
9867bdf65d4SHaggai Eran {
9877bdf65d4SHaggai Eran int ret = 0, npages = 0;
9887bdf65d4SHaggai Eran u64 io_virt;
989a419bfb7SOr Har-Toov __be32 key;
9907bdf65d4SHaggai Eran u32 byte_count;
9917bdf65d4SHaggai Eran size_t bcnt;
9927bdf65d4SHaggai Eran int inline_segment;
9937bdf65d4SHaggai Eran
9947bdf65d4SHaggai Eran if (bytes_mapped)
9957bdf65d4SHaggai Eran *bytes_mapped = 0;
9967bdf65d4SHaggai Eran if (total_wqe_bytes)
9977bdf65d4SHaggai Eran *total_wqe_bytes = 0;
9987bdf65d4SHaggai Eran
9997bdf65d4SHaggai Eran while (wqe < wqe_end) {
10007bdf65d4SHaggai Eran struct mlx5_wqe_data_seg *dseg = wqe;
10017bdf65d4SHaggai Eran
10027bdf65d4SHaggai Eran io_virt = be64_to_cpu(dseg->addr);
1003a419bfb7SOr Har-Toov key = dseg->lkey;
10047bdf65d4SHaggai Eran byte_count = be32_to_cpu(dseg->byte_count);
/* MLX5_INLINE_SEG in byte_count marks inline data, not a scatter entry. */
10057bdf65d4SHaggai Eran inline_segment = !!(byte_count & MLX5_INLINE_SEG);
10067bdf65d4SHaggai Eran bcnt = byte_count & ~MLX5_INLINE_SEG;
10077bdf65d4SHaggai Eran
10087bdf65d4SHaggai Eran if (inline_segment) {
10097bdf65d4SHaggai Eran bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
10107bdf65d4SHaggai Eran wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
10117bdf65d4SHaggai Eran 16);
10127bdf65d4SHaggai Eran } else {
10137bdf65d4SHaggai Eran wqe += sizeof(*dseg);
10147bdf65d4SHaggai Eran }
10157bdf65d4SHaggai Eran
10167bdf65d4SHaggai Eran /* A receive scatter list may end early with a terminator entry:
 * zero length, the terminate mkey, and a zero address. */
1017a419bfb7SOr Har-Toov if (receive_queue && bcnt == 0 &&
1018*594cac11SOr Har-Toov key == dev->mkeys.terminate_scatter_list_mkey &&
1019*594cac11SOr Har-Toov io_virt == 0)
10207bdf65d4SHaggai Eran break;
10217bdf65d4SHaggai Eran
10227bdf65d4SHaggai Eran if (!inline_segment && total_wqe_bytes) {
10237bdf65d4SHaggai Eran *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
1024d9aaed83SArtemy Kovalyov pfault->bytes_committed);
10257bdf65d4SHaggai Eran }
10267bdf65d4SHaggai Eran
10277bdf65d4SHaggai Eran /* A zero length data segment designates a length of 2GB. */
10287bdf65d4SHaggai Eran if (bcnt == 0)
10297bdf65d4SHaggai Eran bcnt = 1U << 31;
10307bdf65d4SHaggai Eran
/* Segments already covered by bytes_committed need no fault handling. */
1031d9aaed83SArtemy Kovalyov if (inline_segment || bcnt <= pfault->bytes_committed) {
1032d9aaed83SArtemy Kovalyov pfault->bytes_committed -=
10337bdf65d4SHaggai Eran min_t(size_t, bcnt,
1034d9aaed83SArtemy Kovalyov pfault->bytes_committed);
10357bdf65d4SHaggai Eran continue;
10367bdf65d4SHaggai Eran }
10377bdf65d4SHaggai Eran
1038a419bfb7SOr Har-Toov ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key),
103981dd4c4bSMoni Shoua io_virt, bcnt,
1040d9aaed83SArtemy Kovalyov &pfault->bytes_committed,
1041fb985e27SJason Gunthorpe bytes_mapped);
10427bdf65d4SHaggai Eran if (ret < 0)
10437bdf65d4SHaggai Eran break;
10447bdf65d4SHaggai Eran npages += ret;
10457bdf65d4SHaggai Eran }
10467bdf65d4SHaggai Eran
10477bdf65d4SHaggai Eran return ret < 0 ? ret : npages;
10487bdf65d4SHaggai Eran }
10497bdf65d4SHaggai Eran
10507bdf65d4SHaggai Eran /*
10517bdf65d4SHaggai Eran * Parse initiator WQE. Advances the wqe pointer to point at the
10527bdf65d4SHaggai Eran * scatter-gather list, and set wqe_end to the end of the WQE.
10537bdf65d4SHaggai Eran */
mlx5_ib_mr_initiator_pfault_handler(struct mlx5_ib_dev * dev,struct mlx5_pagefault * pfault,struct mlx5_ib_qp * qp,void ** wqe,void ** wqe_end,int wqe_length)10547bdf65d4SHaggai Eran static int mlx5_ib_mr_initiator_pfault_handler(
1055d9aaed83SArtemy Kovalyov struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
1056d9aaed83SArtemy Kovalyov struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
10577bdf65d4SHaggai Eran {
10587bdf65d4SHaggai Eran struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
1059d9aaed83SArtemy Kovalyov u16 wqe_index = pfault->wqe.wqe_index;
106017d2f88fSArtemy Kovalyov struct mlx5_base_av *av;
10617bdf65d4SHaggai Eran unsigned ds, opcode;
106219098df2Smajd@mellanox.com u32 qpn = qp->trans_qp.base.mqp.qpn;
10637bdf65d4SHaggai Eran
/* ds is the WQE size in 16-byte units; it must fit in the copied buffer. */
10647bdf65d4SHaggai Eran ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
10657bdf65d4SHaggai Eran if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
10667bdf65d4SHaggai Eran mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
10677bdf65d4SHaggai Eran ds, wqe_length);
10687bdf65d4SHaggai Eran return -EFAULT;
10697bdf65d4SHaggai Eran }
10707bdf65d4SHaggai Eran
10717bdf65d4SHaggai Eran if (ds == 0) {
10727bdf65d4SHaggai Eran mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
107319098df2Smajd@mellanox.com wqe_index, qpn);
10747bdf65d4SHaggai Eran return -EFAULT;
10757bdf65d4SHaggai Eran }
10767bdf65d4SHaggai Eran
10777bdf65d4SHaggai Eran *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
10787bdf65d4SHaggai Eran *wqe += sizeof(*ctrl);
10797bdf65d4SHaggai Eran
10807bdf65d4SHaggai Eran opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
10817bdf65d4SHaggai Eran MLX5_WQE_CTRL_OPCODE_MASK;
108217d2f88fSArtemy Kovalyov
/* XRC initiator WQEs carry an extra XRC segment after the ctrl segment. */
10839ecf6ac1SMaor Gottlieb if (qp->type == IB_QPT_XRC_INI)
108429917f47SMoni Shoua *wqe += sizeof(struct mlx5_wqe_xrc_seg);
10857bdf65d4SHaggai Eran
/* UD/DCI WQEs carry an address vector; its size depends on the AV format. */
10867aede1a2SLeon Romanovsky if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
108717d2f88fSArtemy Kovalyov av = *wqe;
1088931b3c1aSLeon Romanovsky if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
108917d2f88fSArtemy Kovalyov *wqe += sizeof(struct mlx5_av);
109017d2f88fSArtemy Kovalyov else
109117d2f88fSArtemy Kovalyov *wqe += sizeof(struct mlx5_base_av);
109217d2f88fSArtemy Kovalyov }
109317d2f88fSArtemy Kovalyov
/* Skip opcode-specific segments so *wqe lands on the scatter/gather list. */
109417d2f88fSArtemy Kovalyov switch (opcode) {
109517d2f88fSArtemy Kovalyov case MLX5_OPCODE_RDMA_WRITE:
109617d2f88fSArtemy Kovalyov case MLX5_OPCODE_RDMA_WRITE_IMM:
109717d2f88fSArtemy Kovalyov case MLX5_OPCODE_RDMA_READ:
109817d2f88fSArtemy Kovalyov *wqe += sizeof(struct mlx5_wqe_raddr_seg);
109917d2f88fSArtemy Kovalyov break;
110017d2f88fSArtemy Kovalyov case MLX5_OPCODE_ATOMIC_CS:
110117d2f88fSArtemy Kovalyov case MLX5_OPCODE_ATOMIC_FA:
110217d2f88fSArtemy Kovalyov *wqe += sizeof(struct mlx5_wqe_raddr_seg);
110317d2f88fSArtemy Kovalyov *wqe += sizeof(struct mlx5_wqe_atomic_seg);
110417d2f88fSArtemy Kovalyov break;
110517d2f88fSArtemy Kovalyov }
110617d2f88fSArtemy Kovalyov
11077bdf65d4SHaggai Eran return 0;
11087bdf65d4SHaggai Eran }
11097bdf65d4SHaggai Eran
11107bdf65d4SHaggai Eran /*
11116ff7414aSMoni Shoua * Parse responder WQE and set wqe_end to the end of the WQE.
11127bdf65d4SHaggai Eran */
mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev * dev,struct mlx5_ib_srq * srq,void ** wqe,void ** wqe_end,int wqe_length)111308100fadSMoni Shoua static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
111408100fadSMoni Shoua struct mlx5_ib_srq *srq,
111508100fadSMoni Shoua void **wqe, void **wqe_end,
111608100fadSMoni Shoua int wqe_length)
111708100fadSMoni Shoua {
111808100fadSMoni Shoua int wqe_size = 1 << srq->msrq.wqe_shift;
111908100fadSMoni Shoua
112008100fadSMoni Shoua if (wqe_size > wqe_length) {
112108100fadSMoni Shoua mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
112208100fadSMoni Shoua return -EFAULT;
112308100fadSMoni Shoua }
112408100fadSMoni Shoua
112508100fadSMoni Shoua *wqe_end = *wqe + wqe_size;
112608100fadSMoni Shoua *wqe += sizeof(struct mlx5_wqe_srq_next_seg);
112708100fadSMoni Shoua
112808100fadSMoni Shoua return 0;
112908100fadSMoni Shoua }
113008100fadSMoni Shoua
mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev * dev,struct mlx5_ib_qp * qp,void * wqe,void ** wqe_end,int wqe_length)113108100fadSMoni Shoua static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
113208100fadSMoni Shoua struct mlx5_ib_qp *qp,
113308100fadSMoni Shoua void *wqe, void **wqe_end,
11346ff7414aSMoni Shoua int wqe_length)
11357bdf65d4SHaggai Eran {
11367bdf65d4SHaggai Eran struct mlx5_ib_wq *wq = &qp->rq;
11377bdf65d4SHaggai Eran int wqe_size = 1 << wq->wqe_shift;
11387bdf65d4SHaggai Eran
1139c95e6d53SLeon Romanovsky if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
11407bdf65d4SHaggai Eran mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
11417bdf65d4SHaggai Eran return -EFAULT;
11427bdf65d4SHaggai Eran }
11437bdf65d4SHaggai Eran
11447bdf65d4SHaggai Eran if (wqe_size > wqe_length) {
11457bdf65d4SHaggai Eran mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
11467bdf65d4SHaggai Eran return -EFAULT;
11477bdf65d4SHaggai Eran }
11487bdf65d4SHaggai Eran
11496ff7414aSMoni Shoua *wqe_end = wqe + wqe_size;
11507bdf65d4SHaggai Eran
11517bdf65d4SHaggai Eran return 0;
11527bdf65d4SHaggai Eran }
11537bdf65d4SHaggai Eran
/*
 * Take a reference on the WQ resource (SRQ or QP) a page fault refers to.
 * Returns the common resource handle, or NULL when the resource does not
 * exist or the fault type is unknown.
 */
static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
						       u32 wq_num, int pf_type)
{
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		return srq ? &srq->common : NULL;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		return mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
	default:
		return NULL;
	}
}
1177032080abSMoni Shoua
/* Convert a held QP resource back to its mlx5_ib_qp container. */
static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	return to_mibqp((struct mlx5_core_qp *)res);
}
1184d9aaed83SArtemy Kovalyov
res_to_srq(struct mlx5_core_rsc_common * res)118508100fadSMoni Shoua static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
118608100fadSMoni Shoua {
118708100fadSMoni Shoua struct mlx5_core_srq *msrq =
118808100fadSMoni Shoua container_of(res, struct mlx5_core_srq, common);
118908100fadSMoni Shoua
119008100fadSMoni Shoua return to_mibsrq(msrq);
119108100fadSMoni Shoua }
119208100fadSMoni Shoua
/*
 * Handle a WQE-triggered page fault: look up the QP or SRQ the fault
 * refers to, copy the faulting WQE out of the user's queue buffer into a
 * scratch page, parse it, fault in the pages referenced by its data
 * segments, and finally resume the hardware.
 *
 * resume_with_error stays 1 on every failure path, so
 * mlx5_ib_page_fault_resume() reports the fault as unrecoverable unless
 * all pages were successfully brought in.
 */
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	/* MLX5_PFAULT_REQUESTOR set => fault originated on the send queue */
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	/* Take a reference on the QP/SRQ so it cannot disappear under us;
	 * released via mlx5_core_res_put() below. */
	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	/* Scratch page that receives the copied-out WQE. */
	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		/* Requester fault: copy and parse a send-queue WQE. */
		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		/* Responder fault on a QP receive queue. */
		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		/* No QP => the resource is an SRQ (checked above). */
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
					   &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	/* The parser advances wqe to the first data segment; an empty or
	 * invalid span means there is nothing to fault in. */
	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out; /* -EAGAIN: resume without reporting an error */

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

	/* Intentional fall-through: ret is 0 here, so nothing is logged. */
read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	/* free_page() tolerates the NULL (0) case from early exits. */
	free_page((unsigned long)wqe_start);
}
12827bdf65d4SHaggai Eran
/* Number of PAGE_SIZE pages spanned by the byte range [address, address+length). */
static int pages_in_range(u64 address, u32 length)
{
	u64 start = address & PAGE_MASK;
	u64 end = ALIGN(address + length, PAGE_SIZE);

	return (end - start) >> PAGE_SHIFT;
}
1288eab668a6SHaggai Eran
/*
 * Handle an RDMA-triggered page fault: fault in the pages needed by the
 * current packet, resume the QP, and then optionally prefetch the
 * remainder of the RDMA operation outside the fault context.
 */
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	/* Skip over the part of the operation the device already committed. */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		/* Hard failure, or fewer pages mapped than the range needs:
		 * resume with error so the operation is aborted. */
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		/* Best-effort prefetch; only -EAGAIN is an expected failure. */
		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}
1357eab668a6SHaggai Eran
mlx5_ib_pfault(struct mlx5_ib_dev * dev,struct mlx5_pagefault * pfault)1358d5d284b8SSaeed Mahameed static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
13596aec21f6SHaggai Eran {
1360d9aaed83SArtemy Kovalyov u8 event_subtype = pfault->event_subtype;
13616aec21f6SHaggai Eran
13626aec21f6SHaggai Eran switch (event_subtype) {
13637bdf65d4SHaggai Eran case MLX5_PFAULT_SUBTYPE_WQE:
1364d9aaed83SArtemy Kovalyov mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
13657bdf65d4SHaggai Eran break;
1366eab668a6SHaggai Eran case MLX5_PFAULT_SUBTYPE_RDMA:
1367d9aaed83SArtemy Kovalyov mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
1368eab668a6SHaggai Eran break;
13696aec21f6SHaggai Eran default:
1370d9aaed83SArtemy Kovalyov mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
13716aec21f6SHaggai Eran event_subtype);
1372d9aaed83SArtemy Kovalyov mlx5_ib_page_fault_resume(dev, pfault, 1);
13736aec21f6SHaggai Eran }
13746aec21f6SHaggai Eran }
13756aec21f6SHaggai Eran
mlx5_ib_eqe_pf_action(struct work_struct * work)1376d5d284b8SSaeed Mahameed static void mlx5_ib_eqe_pf_action(struct work_struct *work)
1377d5d284b8SSaeed Mahameed {
1378d5d284b8SSaeed Mahameed struct mlx5_pagefault *pfault = container_of(work,
1379d5d284b8SSaeed Mahameed struct mlx5_pagefault,
1380d5d284b8SSaeed Mahameed work);
1381d5d284b8SSaeed Mahameed struct mlx5_ib_pf_eq *eq = pfault->eq;
1382d5d284b8SSaeed Mahameed
1383d5d284b8SSaeed Mahameed mlx5_ib_pfault(eq->dev, pfault);
1384d5d284b8SSaeed Mahameed mempool_free(pfault, eq->pool);
1385d5d284b8SSaeed Mahameed }
1386d5d284b8SSaeed Mahameed
/*
 * Drain the page-fault EQ: decode each EQE into a struct mlx5_pagefault
 * taken from the mempool and queue it for processing on eq->wq.  Both
 * callers (the IRQ notifier and the drain work item) hold eq->lock.
 */
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0; /* count of consumed EQEs, folded into the consumer index */

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			/* Pool exhausted: defer to the work item, which
			 * refills the pool and re-runs this drain
			 * (see mlx5_ib_eq_pf_action). */
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;
		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);

		mlx5_ib_dbg(eq->dev,
			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
			    eqe->sub_type, pfault->bytes_committed);

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			/* pftype_token packs the fault type in the top byte
			 * and the token in the low 24 bits. */
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			/* pftype_wq packs the fault type (3 bits of the top
			 * byte) and the WQ number (low 24 bits). */
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    pfault->type, pfault->token,
				    pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

	/* Publish the new consumer index and re-arm the EQ. */
	mlx5_eq_update_ci(eq->core, cc, 1);
}
1473d5d284b8SSaeed Mahameed
mlx5_ib_eq_pf_int(struct notifier_block * nb,unsigned long type,void * data)1474ca390799SYuval Avnery static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
1475ca390799SYuval Avnery void *data)
1476d5d284b8SSaeed Mahameed {
1477ca390799SYuval Avnery struct mlx5_ib_pf_eq *eq =
1478ca390799SYuval Avnery container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
1479d5d284b8SSaeed Mahameed unsigned long flags;
1480d5d284b8SSaeed Mahameed
1481d5d284b8SSaeed Mahameed if (spin_trylock_irqsave(&eq->lock, flags)) {
1482d5d284b8SSaeed Mahameed mlx5_ib_eq_pf_process(eq);
1483d5d284b8SSaeed Mahameed spin_unlock_irqrestore(&eq->lock, flags);
1484d5d284b8SSaeed Mahameed } else {
1485d5d284b8SSaeed Mahameed schedule_work(&eq->work);
1486d5d284b8SSaeed Mahameed }
1487d5d284b8SSaeed Mahameed
1488d5d284b8SSaeed Mahameed return IRQ_HANDLED;
1489d5d284b8SSaeed Mahameed }
1490d5d284b8SSaeed Mahameed
1491d5d284b8SSaeed Mahameed /* mempool_refill() was proposed but unfortunately wasn't accepted
1492d5d284b8SSaeed Mahameed * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
1493d5d284b8SSaeed Mahameed * Cheap workaround.
1494d5d284b8SSaeed Mahameed */
mempool_refill(mempool_t * pool)1495d5d284b8SSaeed Mahameed static void mempool_refill(mempool_t *pool)
1496d5d284b8SSaeed Mahameed {
1497d5d284b8SSaeed Mahameed while (pool->curr_nr < pool->min_nr)
1498d5d284b8SSaeed Mahameed mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
1499d5d284b8SSaeed Mahameed }
1500d5d284b8SSaeed Mahameed
mlx5_ib_eq_pf_action(struct work_struct * work)1501d5d284b8SSaeed Mahameed static void mlx5_ib_eq_pf_action(struct work_struct *work)
1502d5d284b8SSaeed Mahameed {
1503d5d284b8SSaeed Mahameed struct mlx5_ib_pf_eq *eq =
1504d5d284b8SSaeed Mahameed container_of(work, struct mlx5_ib_pf_eq, work);
1505d5d284b8SSaeed Mahameed
1506d5d284b8SSaeed Mahameed mempool_refill(eq->pool);
1507d5d284b8SSaeed Mahameed
1508d5d284b8SSaeed Mahameed spin_lock_irq(&eq->lock);
1509d5d284b8SSaeed Mahameed mlx5_ib_eq_pf_process(eq);
1510d5d284b8SSaeed Mahameed spin_unlock_irq(&eq->lock);
1511d5d284b8SSaeed Mahameed }
1512d5d284b8SSaeed Mahameed
enum {
	/* Number of entries in the dedicated page-fault event queue */
	MLX5_IB_NUM_PF_EQE = 0x1000,
	/* Minimum pre-allocated pagefault descriptors kept in the mempool
	 * so GFP_ATOMIC allocations in the EQ drain path can succeed */
	MLX5_IB_NUM_PF_DRAIN = 64,
};
1517d5d284b8SSaeed Mahameed
/*
 * Lazily create and enable the ODP page-fault EQ.  Serialized by
 * dev->odp_eq_mutex; if the EQ already exists (eq->core set) this is a
 * no-op returning 0.  On failure everything allocated so far is undone
 * and eq->core is left NULL so a later call can retry.
 */
int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err = 0;

	mutex_lock(&dev->odp_eq_mutex);
	if (eq->core)
		goto unlock; /* already created by a concurrent caller */
	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	/* Reserve of pagefault descriptors for atomic-context allocation. */
	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool) {
		err = -ENOMEM;
		goto unlock;
	}

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	/* Subscribe only to page-fault events. */
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}
	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
	if (err) {
		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
		goto err_eq;
	}

	mutex_unlock(&dev->odp_eq_mutex);
	return 0;
err_eq:
	mlx5_eq_destroy_generic(dev->mdev, eq->core);
err_wq:
	/* Clear eq->core (it may hold an ERR_PTR) so retry is possible. */
	eq->core = NULL;
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
unlock:
	mutex_unlock(&dev->odp_eq_mutex);
	return err;
}
1574d5d284b8SSaeed Mahameed
/*
 * Tear down the page-fault EQ created by mlx5r_odp_create_eq().  The
 * order matters: detach the interrupt notifier first (mlx5_eq_disable),
 * destroy the EQ, cancel any in-flight drain work, then release the
 * workqueue and the mempool.
 */
static int
mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	if (!eq->core)
		return 0; /* EQ was never created */
	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}
1590d5d284b8SSaeed Mahameed
mlx5_odp_init_mkey_cache(struct mlx5_ib_dev * dev)159101137808SAharon Landau int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
159281713d37SArtemy Kovalyov {
159381713d37SArtemy Kovalyov struct mlx5r_cache_rb_key rb_key = {
159481713d37SArtemy Kovalyov .access_mode = MLX5_MKC_ACCESS_MODE_KSM,
159581713d37SArtemy Kovalyov .ndescs = mlx5_imr_ksm_entries,
159681713d37SArtemy Kovalyov };
159781713d37SArtemy Kovalyov struct mlx5_cache_ent *ent;
159881713d37SArtemy Kovalyov
15999ee2516cSAharon Landau if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
160081713d37SArtemy Kovalyov return 0;
160181713d37SArtemy Kovalyov
160281713d37SArtemy Kovalyov ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
160381713d37SArtemy Kovalyov if (IS_ERR(ent))
160481713d37SArtemy Kovalyov return PTR_ERR(ent);
160581713d37SArtemy Kovalyov
16069ee2516cSAharon Landau return 0;
160781713d37SArtemy Kovalyov }
160881713d37SArtemy Kovalyov
/* Extra verbs installed on the IB device only when ODP is supported
 * (see mlx5_ib_odp_init_one). */
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
};
161281713d37SArtemy Kovalyov
mlx5_ib_odp_init_one(struct mlx5_ib_dev * dev)1613813e90b1SMoni Shoua int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
1614813e90b1SMoni Shoua {
1615813e90b1SMoni Shoua internal_fill_odp_caps(dev);
1616813e90b1SMoni Shoua
161781713d37SArtemy Kovalyov if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
16186aec21f6SHaggai Eran return 0;
1619e5dc370bSShay Drory
1620e5dc370bSShay Drory ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);
162100815752SMoni Shoua
1622*594cac11SOr Har-Toov mutex_init(&dev->odp_eq_mutex);
162300815752SMoni Shoua return 0;
1624813e90b1SMoni Shoua }
1625813e90b1SMoni Shoua
mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev * dev)1626ad50294dSShay Drory void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
1627*594cac11SOr Har-Toov {
1628d5d284b8SSaeed Mahameed if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
1629d5d284b8SSaeed Mahameed return;
1630d5d284b8SSaeed Mahameed
1631d5d284b8SSaeed Mahameed mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
163200815752SMoni Shoua }
1633d5d284b8SSaeed Mahameed
mlx5_ib_odp_init(void)1634d5d284b8SSaeed Mahameed int mlx5_ib_odp_init(void)
1635ad50294dSShay Drory {
16366aec21f6SHaggai Eran mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
16376aec21f6SHaggai Eran MLX5_IMR_MTT_BITS);
163881713d37SArtemy Kovalyov
163981713d37SArtemy Kovalyov return 0;
164081713d37SArtemy Kovalyov }
164181713d37SArtemy Kovalyov
/* Deferred MR prefetch request handed to the unbound workqueue. */
struct prefetch_mr_work {
	struct work_struct work;
	u32 pf_flags;	/* MLX5_PF_FLAGS_* forwarded to pagefault_mr() */
	u32 num_sge;	/* number of valid entries in frags[] */
	struct {
		u64 io_virt;		/* start of the range to prefetch */
		struct mlx5_ib_mr *mr;	/* holds an mmkey usecount reference */
		size_t length;		/* length of the range in bytes */
	} frags[];
};
1652fb985e27SJason Gunthorpe
destroy_prefetch_work(struct prefetch_mr_work * work)1653fb985e27SJason Gunthorpe static void destroy_prefetch_work(struct prefetch_mr_work *work)
1654fb985e27SJason Gunthorpe {
1655813e90b1SMoni Shoua u32 i;
1656813e90b1SMoni Shoua
1657fb985e27SJason Gunthorpe for (i = 0; i < work->num_sge; ++i)
1658a6bc3875SMoni Shoua mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);
1659a6bc3875SMoni Shoua
1660fb985e27SJason Gunthorpe kvfree(work);
1661fb985e27SJason Gunthorpe }
1662db72438cSYishai Hadas
1663db72438cSYishai Hadas static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd * pd,enum ib_uverbs_advise_mr_advice advice,u32 lkey)1664fb985e27SJason Gunthorpe get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
1665fb985e27SJason Gunthorpe u32 lkey)
1666fb985e27SJason Gunthorpe {
1667fb985e27SJason Gunthorpe struct mlx5_ib_dev *dev = to_mdev(pd->device);
1668fb985e27SJason Gunthorpe struct mlx5_ib_mr *mr = NULL;
1669fb985e27SJason Gunthorpe struct mlx5_ib_mkey *mmkey;
1670fb985e27SJason Gunthorpe
1671fb985e27SJason Gunthorpe xa_lock(&dev->odp_mkeys);
1672db72438cSYishai Hadas mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
16734123bfb0SAharon Landau if (!mmkey || mmkey->key != lkey) {
1674fb985e27SJason Gunthorpe mr = ERR_PTR(-ENOENT);
1675db72438cSYishai Hadas goto end;
1676806b101bSJason Gunthorpe }
167749b99314SJason Gunthorpe if (mmkey->type != MLX5_MKEY_MR) {
167849b99314SJason Gunthorpe mr = ERR_PTR(-EINVAL);
1679db72438cSYishai Hadas goto end;
168049b99314SJason Gunthorpe }
168149b99314SJason Gunthorpe
168249b99314SJason Gunthorpe mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
168349b99314SJason Gunthorpe
168449b99314SJason Gunthorpe if (mr->ibmr.pd != pd) {
1685fb985e27SJason Gunthorpe mr = ERR_PTR(-EPERM);
1686fb985e27SJason Gunthorpe goto end;
1687fb985e27SJason Gunthorpe }
1688db72438cSYishai Hadas
168949b99314SJason Gunthorpe /* prefetch with write-access must be supported by the MR */
1690db72438cSYishai Hadas if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
1691db72438cSYishai Hadas !mr->umem->writable) {
1692fb985e27SJason Gunthorpe mr = ERR_PTR(-EPERM);
1693fb985e27SJason Gunthorpe goto end;
1694fb985e27SJason Gunthorpe }
1695db72438cSYishai Hadas
169649b99314SJason Gunthorpe refcount_inc(&mmkey->usecount);
1697db72438cSYishai Hadas end:
1698db72438cSYishai Hadas xa_unlock(&dev->odp_mkeys);
1699fb985e27SJason Gunthorpe return mr;
1700db72438cSYishai Hadas }
1701db72438cSYishai Hadas
mlx5_ib_prefetch_mr_work(struct work_struct * w)1702db72438cSYishai Hadas static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
1703fb985e27SJason Gunthorpe {
1704fb985e27SJason Gunthorpe struct prefetch_mr_work *work =
1705fb985e27SJason Gunthorpe container_of(w, struct prefetch_mr_work, work);
1706fb985e27SJason Gunthorpe u32 bytes_mapped = 0;
1707fb985e27SJason Gunthorpe int ret;
1708fb985e27SJason Gunthorpe u32 i;
1709fb985e27SJason Gunthorpe
1710fb985e27SJason Gunthorpe /* We rely on IB/core that work is executed if we have num_sge != 0 only. */
1711d473f4dcSMaor Gottlieb WARN_ON(!work->num_sge);
1712fb985e27SJason Gunthorpe for (i = 0; i < work->num_sge; ++i) {
1713fb985e27SJason Gunthorpe ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
1714d4d7f596SMaor Gottlieb work->frags[i].length, &bytes_mapped,
1715d4d7f596SMaor Gottlieb work->pf_flags);
1716d473f4dcSMaor Gottlieb if (ret <= 0)
1717d473f4dcSMaor Gottlieb continue;
1718fb985e27SJason Gunthorpe mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
1719fb985e27SJason Gunthorpe }
1720d473f4dcSMaor Gottlieb
1721d473f4dcSMaor Gottlieb destroy_prefetch_work(work);
1722d473f4dcSMaor Gottlieb }
1723d473f4dcSMaor Gottlieb
/*
 * Populate @work with one referenced MR per SGE.  On failure, num_sge is
 * set to the count of frags that already hold a reference so that the
 * caller's destroy_prefetch_work() releases exactly those.
 */
static int init_prefetch_work(struct ib_pd *pd,
			      enum ib_uverbs_advise_mr_advice advice,
			      u32 pf_flags, struct prefetch_mr_work *work,
			      struct ib_sge *sg_list, u32 num_sge)
{
	u32 idx;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
	work->pf_flags = pf_flags;

	for (idx = 0; idx < num_sge; idx++) {
		struct mlx5_ib_mr *mr =
			get_prefetchable_mr(pd, advice, sg_list[idx].lkey);

		if (IS_ERR(mr)) {
			work->num_sge = idx;
			return PTR_ERR(mr);
		}
		work->frags[idx].io_virt = sg_list[idx].addr;
		work->frags[idx].length = sg_list[idx].length;
		work->frags[idx].mr = mr;
	}
	work->num_sge = num_sge;
	return 0;
}
1749fb985e27SJason Gunthorpe
/*
 * Synchronous prefetch: fault in each SGE's range in the caller's
 * context, holding the MR's mkey reference only across the fault.
 */
static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
				    enum ib_uverbs_advise_mr_advice advice,
				    u32 pf_flags, struct ib_sge *sg_list,
				    u32 num_sge)
{
	u32 bytes_mapped = 0;
	u32 idx;

	for (idx = 0; idx < num_sge; idx++) {
		struct mlx5_ib_mr *mr;
		int npages;

		mr = get_prefetchable_mr(pd, advice, sg_list[idx].lkey);
		if (IS_ERR(mr))
			return PTR_ERR(mr);

		npages = pagefault_mr(mr, sg_list[idx].addr,
				      sg_list[idx].length, &bytes_mapped,
				      pf_flags);
		if (npages >= 0)
			mlx5_update_odp_stats(mr, prefetch, npages);
		mlx5r_deref_odp_mkey(&mr->mmkey);
		if (npages < 0)
			return npages;
	}

	return 0;
}
1777db72438cSYishai Hadas
/*
 * Entry point for ibv_advise_mr() prefetch advice.  FLUSH requests are
 * served synchronously; everything else is queued to the unbound
 * workqueue and returns immediately.
 */
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct prefetch_mr_work *work;
	u32 pf_flags = 0;
	int rc;

	switch (advice) {
	case IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH:
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
		break;
	case IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT:
		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
		break;
	default:
		break;
	}

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge);
	if (rc) {
		/* Releases refs taken for the frags initialized so far. */
		destroy_prefetch_work(work);
		return rc;
	}

	queue_work(system_unbound_wq, &work->work);
	return 0;
}
1808fb985e27SJason Gunthorpe