xref: /openbmc/linux/drivers/infiniband/sw/siw/siw_mem.c (revision 04eb94d526423ff082efce61f4f26b0369d0bfdd)
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/gfp.h>
7 #include <rdma/ib_verbs.h>
8 #include <linux/dma-mapping.h>
9 #include <linux/slab.h>
10 #include <linux/sched/mm.h>
11 #include <linux/resource.h>
12 
13 #include "siw.h"
14 #include "siw_mem.h"
15 
16 /*
17  * Stag lookup is based on its index part only (24 bits).
18  * The code avoids special Stag of zero and tries to randomize
19  * STag values between 1 and SIW_STAG_MAX_INDEX.
20  */
21 int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
22 {
23 	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
24 	u32 id, next;
25 
26 	get_random_bytes(&next, 4);
27 	next &= 0x00ffffff;
28 
29 	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
30 	    GFP_KERNEL) < 0)
31 		return -ENOMEM;
32 
33 	/* Set the STag index part */
34 	m->stag = id << 8;
35 
36 	siw_dbg_mem(m, "new MEM object\n");
37 
38 	return 0;
39 }
40 
41 /*
42  * siw_mem_id2obj()
43  *
44  * resolves memory from stag given by id. might be called from:
45  * o process context before sending out of sgl, or
46  * o in softirq when resolving target memory
47  */
48 struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
49 {
50 	struct siw_mem *mem;
51 
52 	rcu_read_lock();
53 	mem = xa_load(&sdev->mem_xa, stag_index);
54 	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
55 		rcu_read_unlock();
56 		return mem;
57 	}
58 	rcu_read_unlock();
59 
60 	return NULL;
61 }
62 
63 static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
64 			   bool dirty)
65 {
66 	struct page **p = chunk->plist;
67 
68 	while (num_pages--) {
69 		if (!PageDirty(*p) && dirty)
70 			put_user_pages_dirty_lock(p, 1);
71 		else
72 			put_user_page(*p);
73 		p++;
74 	}
75 }
76 
77 void siw_umem_release(struct siw_umem *umem, bool dirty)
78 {
79 	struct mm_struct *mm_s = umem->owning_mm;
80 	int i, num_pages = umem->num_pages;
81 
82 	for (i = 0; num_pages; i++) {
83 		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);
84 
85 		siw_free_plist(&umem->page_chunk[i], to_free,
86 			       umem->writable && dirty);
87 		kfree(umem->page_chunk[i].plist);
88 		num_pages -= to_free;
89 	}
90 	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
91 
92 	mmdrop(mm_s);
93 	kfree(umem->page_chunk);
94 	kfree(umem);
95 }
96 
97 int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
98 		   u64 start, u64 len, int rights)
99 {
100 	struct siw_device *sdev = to_siw_dev(pd->device);
101 	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
102 	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
103 	u32 id, next;
104 
105 	if (!mem)
106 		return -ENOMEM;
107 
108 	mem->mem_obj = mem_obj;
109 	mem->stag_valid = 0;
110 	mem->sdev = sdev;
111 	mem->va = start;
112 	mem->len = len;
113 	mem->pd = pd;
114 	mem->perms = rights & IWARP_ACCESS_MASK;
115 	kref_init(&mem->ref);
116 
117 	mr->mem = mem;
118 
119 	get_random_bytes(&next, 4);
120 	next &= 0x00ffffff;
121 
122 	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
123 	    GFP_KERNEL) < 0) {
124 		kfree(mem);
125 		return -ENOMEM;
126 	}
127 	/* Set the STag index part */
128 	mem->stag = id << 8;
129 	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
130 
131 	return 0;
132 }
133 
134 void siw_mr_drop_mem(struct siw_mr *mr)
135 {
136 	struct siw_mem *mem = mr->mem, *found;
137 
138 	mem->stag_valid = 0;
139 
140 	/* make STag invalid visible asap */
141 	smp_mb();
142 
143 	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
144 	WARN_ON(found != mem);
145 	siw_mem_put(mem);
146 }
147 
148 void siw_free_mem(struct kref *ref)
149 {
150 	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
151 
152 	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
153 
154 	if (!mem->is_mw && mem->mem_obj) {
155 		if (mem->is_pbl == 0)
156 			siw_umem_release(mem->umem, true);
157 		else
158 			kfree(mem->pbl);
159 	}
160 	kfree(mem);
161 }
162 
163 /*
164  * siw_check_mem()
165  *
166  * Check protection domain, STAG state, access permissions and
167  * address range for memory object.
168  *
169  * @pd:		Protection Domain memory should belong to
170  * @mem:	memory to be checked
171  * @addr:	starting addr of mem
172  * @perms:	requested access permissions
173  * @len:	len of memory interval to be checked
174  *
175  */
176 int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
177 		  enum ib_access_flags perms, int len)
178 {
179 	if (!mem->stag_valid) {
180 		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
181 		return -E_STAG_INVALID;
182 	}
183 	if (mem->pd != pd) {
184 		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
185 		return -E_PD_MISMATCH;
186 	}
187 	/*
188 	 * check access permissions
189 	 */
190 	if ((mem->perms & perms) < perms) {
191 		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
192 			   mem->perms, perms);
193 		return -E_ACCESS_PERM;
194 	}
195 	/*
196 	 * Check if access falls into valid memory interval.
197 	 */
198 	if (addr < mem->va || addr + len > mem->va + mem->len) {
199 		siw_dbg_pd(pd, "MEM interval len %d\n", len);
200 		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
201 			   (unsigned long long)addr,
202 			   (unsigned long long)(addr + len));
203 		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
204 			   (unsigned long long)mem->va,
205 			   (unsigned long long)(mem->va + mem->len),
206 			   mem->stag);
207 
208 		return -E_BASE_BOUNDS;
209 	}
210 	return E_ACCESS_OK;
211 }
212 
213 /*
214  * siw_check_sge()
215  *
216  * Check SGE for access rights in given interval
217  *
218  * @pd:		Protection Domain memory should belong to
219  * @sge:	SGE to be checked
220  * @mem:	location of memory reference within array
221  * @perms:	requested access permissions
222  * @off:	starting offset in SGE
223  * @len:	len of memory interval to be checked
224  *
225  * NOTE: Function references SGE's memory object (mem->obj)
226  * if not yet done. New reference is kept if check went ok and
227  * released if check failed. If mem->obj is already valid, no new
228  * lookup is being done and mem is not released it check fails.
229  */
230 int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
231 		  enum ib_access_flags perms, u32 off, int len)
232 {
233 	struct siw_device *sdev = to_siw_dev(pd->device);
234 	struct siw_mem *new = NULL;
235 	int rv = E_ACCESS_OK;
236 
237 	if (len + off > sge->length) {
238 		rv = -E_BASE_BOUNDS;
239 		goto fail;
240 	}
241 	if (*mem == NULL) {
242 		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
243 		if (unlikely(!new)) {
244 			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
245 			rv = -E_STAG_INVALID;
246 			goto fail;
247 		}
248 		*mem = new;
249 	}
250 	/* Check if user re-registered with different STag key */
251 	if (unlikely((*mem)->stag != sge->lkey)) {
252 		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
253 		rv = -E_STAG_INVALID;
254 		goto fail;
255 	}
256 	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
257 	if (unlikely(rv))
258 		goto fail;
259 
260 	return 0;
261 
262 fail:
263 	if (new) {
264 		*mem = NULL;
265 		siw_mem_put(new);
266 	}
267 	return rv;
268 }
269 
270 void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
271 {
272 	switch (op) {
273 	case SIW_OP_SEND:
274 	case SIW_OP_WRITE:
275 	case SIW_OP_SEND_WITH_IMM:
276 	case SIW_OP_SEND_REMOTE_INV:
277 	case SIW_OP_READ:
278 	case SIW_OP_READ_LOCAL_INV:
279 		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
280 			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
281 		break;
282 
283 	case SIW_OP_RECEIVE:
284 		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
285 		break;
286 
287 	case SIW_OP_READ_RESPONSE:
288 		siw_unref_mem_sgl(wqe->mem, 1);
289 		break;
290 
291 	default:
292 		/*
293 		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
294 		 * do not hold memory references
295 		 */
296 		break;
297 	}
298 }
299 
300 int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
301 {
302 	struct siw_device *sdev = to_siw_dev(pd->device);
303 	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
304 	int rv = 0;
305 
306 	if (unlikely(!mem)) {
307 		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
308 		return -EINVAL;
309 	}
310 	if (unlikely(mem->pd != pd)) {
311 		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
312 		rv = -EACCES;
313 		goto out;
314 	}
315 	/*
316 	 * Per RDMA verbs definition, an STag may already be in invalid
317 	 * state if invalidation is requested. So no state check here.
318 	 */
319 	mem->stag_valid = 0;
320 
321 	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
322 out:
323 	siw_mem_put(mem);
324 	return rv;
325 }
326 
327 /*
328  * Gets physical address backed by PBL element. Address is referenced
329  * by linear byte offset into list of variably sized PB elements.
330  * Optionally, provides remaining len within current element, and
331  * current PBL index for later resume at same element.
332  */
333 u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
334 {
335 	int i = idx ? *idx : 0;
336 
337 	while (i < pbl->num_buf) {
338 		struct siw_pble *pble = &pbl->pbe[i];
339 
340 		if (pble->pbl_off + pble->size > off) {
341 			u64 pble_off = off - pble->pbl_off;
342 
343 			if (len)
344 				*len = pble->size - pble_off;
345 			if (idx)
346 				*idx = i;
347 
348 			return pble->addr + pble_off;
349 		}
350 		i++;
351 	}
352 	if (len)
353 		*len = 0;
354 	return 0;
355 }
356 
357 struct siw_pbl *siw_pbl_alloc(u32 num_buf)
358 {
359 	struct siw_pbl *pbl;
360 	int buf_size = sizeof(*pbl);
361 
362 	if (num_buf == 0)
363 		return ERR_PTR(-EINVAL);
364 
365 	buf_size += ((num_buf - 1) * sizeof(struct siw_pble));
366 
367 	pbl = kzalloc(buf_size, GFP_KERNEL);
368 	if (!pbl)
369 		return ERR_PTR(-ENOMEM);
370 
371 	pbl->max_buf = num_buf;
372 
373 	return pbl;
374 }
375 
376 struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
377 {
378 	struct siw_umem *umem;
379 	struct mm_struct *mm_s;
380 	u64 first_page_va;
381 	unsigned long mlock_limit;
382 	unsigned int foll_flags = FOLL_WRITE;
383 	int num_pages, num_chunks, i, rv = 0;
384 
385 	if (!can_do_mlock())
386 		return ERR_PTR(-EPERM);
387 
388 	if (!len)
389 		return ERR_PTR(-EINVAL);
390 
391 	first_page_va = start & PAGE_MASK;
392 	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
393 	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
394 
395 	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
396 	if (!umem)
397 		return ERR_PTR(-ENOMEM);
398 
399 	mm_s = current->mm;
400 	umem->owning_mm = mm_s;
401 	umem->writable = writable;
402 
403 	mmgrab(mm_s);
404 
405 	if (!writable)
406 		foll_flags |= FOLL_FORCE;
407 
408 	down_read(&mm_s->mmap_sem);
409 
410 	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
411 
412 	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
413 		rv = -ENOMEM;
414 		goto out_sem_up;
415 	}
416 	umem->fp_addr = first_page_va;
417 
418 	umem->page_chunk =
419 		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
420 	if (!umem->page_chunk) {
421 		rv = -ENOMEM;
422 		goto out_sem_up;
423 	}
424 	for (i = 0; num_pages; i++) {
425 		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
426 
427 		umem->page_chunk[i].plist =
428 			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
429 		if (!umem->page_chunk[i].plist) {
430 			rv = -ENOMEM;
431 			goto out_sem_up;
432 		}
433 		got = 0;
434 		while (nents) {
435 			struct page **plist = &umem->page_chunk[i].plist[got];
436 
437 			rv = get_user_pages(first_page_va, nents,
438 					    foll_flags | FOLL_LONGTERM,
439 					    plist, NULL);
440 			if (rv < 0)
441 				goto out_sem_up;
442 
443 			umem->num_pages += rv;
444 			atomic64_add(rv, &mm_s->pinned_vm);
445 			first_page_va += rv * PAGE_SIZE;
446 			nents -= rv;
447 			got += rv;
448 		}
449 		num_pages -= got;
450 	}
451 out_sem_up:
452 	up_read(&mm_s->mmap_sem);
453 
454 	if (rv > 0)
455 		return umem;
456 
457 	siw_umem_release(umem, false);
458 
459 	return ERR_PTR(rv);
460 }
461