1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/gfp.h>
7 #include <rdma/ib_verbs.h>
8 #include <linux/dma-mapping.h>
9 #include <linux/slab.h>
10 #include <linux/sched/mm.h>
11 #include <linux/resource.h>
12 
13 #include "siw.h"
14 #include "siw_mem.h"
15 
16 /*
17  * Stag lookup is based on its index part only (24 bits).
18  * The code avoids special Stag of zero and tries to randomize
19  * STag values between 1 and SIW_STAG_MAX_INDEX.
20  */
siw_mem_add(struct siw_device * sdev,struct siw_mem * m)21 int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
22 {
23 	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
24 	u32 id, next;
25 
26 	get_random_bytes(&next, 4);
27 	next &= 0x00ffffff;
28 
29 	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
30 	    GFP_KERNEL) < 0)
31 		return -ENOMEM;
32 
33 	/* Set the STag index part */
34 	m->stag = id << 8;
35 
36 	siw_dbg_mem(m, "new MEM object\n");
37 
38 	return 0;
39 }
40 
41 /*
42  * siw_mem_id2obj()
43  *
44  * resolves memory from stag given by id. might be called from:
45  * o process context before sending out of sgl, or
46  * o in softirq when resolving target memory
47  */
siw_mem_id2obj(struct siw_device * sdev,int stag_index)48 struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
49 {
50 	struct siw_mem *mem;
51 
52 	rcu_read_lock();
53 	mem = xa_load(&sdev->mem_xa, stag_index);
54 	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
55 		rcu_read_unlock();
56 		return mem;
57 	}
58 	rcu_read_unlock();
59 
60 	return NULL;
61 }
62 
/*
 * Unpin the first @num_pages pages referenced by the page list of
 * @chunk. @dirty is handed through to unpin_user_pages_dirty_lock().
 */
static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
			   bool dirty)
{
	struct page **pages = chunk->plist;

	unpin_user_pages_dirty_lock(pages, num_pages, dirty);
}
68 
/*
 * siw_umem_release()
 *
 * Tear down a user memory object: unpin all umem->num_pages pages,
 * free the page list of every chunk visited, subtract the pages from
 * the pinned_vm count of the owning mm, drop the mm reference and
 * free the chunk array and @umem itself.
 *
 * @umem:	user memory object to release
 * @dirty:	ask for pages to be marked dirty on unpin; only
 *		honored if the umem is writable
 */
void siw_umem_release(struct siw_umem *umem, bool dirty)
{
	struct mm_struct *mm_s = umem->owning_mm;
	struct siw_page_chunk *chunk = umem->page_chunk;
	bool make_dirty = umem->writable && dirty;
	int remaining = umem->num_pages;

	/* Walk the chunks until all recorded pages are accounted for */
	while (remaining) {
		int batch = min_t(int, PAGES_PER_CHUNK, remaining);

		siw_free_plist(chunk, batch, make_dirty);
		kfree(chunk->plist);

		remaining -= batch;
		chunk++;
	}
	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

	mmdrop(mm_s);
	kfree(umem->page_chunk);
	kfree(umem);
}
88 
/*
 * siw_mr_add_mem()
 *
 * Allocate a memory object for @mr, initialize it and enter it into
 * the STag table of the device @pd belongs to. The new STag is also
 * stored as lkey and rkey of the MR. The STag starts out invalid.
 *
 * @mr:		memory region to attach the new memory object to
 * @pd:		protection domain the memory belongs to
 * @mem_obj:	backing object, stored as is
 * @start:	start address of the memory
 * @len:	length of the memory
 * @rights:	access rights, masked by IWARP_ACCESS_MASK
 *
 * Returns 0 on success, or -ENOMEM if the memory object or an STag
 * index could not be allocated.
 */
int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct xa_limit stag_range = XA_LIMIT(1, 0x00ffffff);
	struct siw_mem *mem;
	u32 index, first;
	int rv;

	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	kref_init(&mem->ref);
	mem->sdev = sdev;
	mem->pd = pd;
	mem->mem_obj = mem_obj;
	mem->va = start;
	mem->len = len;
	mem->perms = rights & IWARP_ACCESS_MASK;
	mem->stag_valid = 0;

	/* Randomize where the cyclic index search begins */
	get_random_bytes(&first, sizeof(first));
	first &= 0x00ffffff;

	rv = xa_alloc_cyclic(&sdev->mem_xa, &index, mem, stag_range, &first,
			     GFP_KERNEL);
	if (rv < 0) {
		kfree(mem);
		return -ENOMEM;
	}

	mr->mem = mem;
	/* Set the STag index part */
	mem->stag = index << 8;
	mr->base_mr.rkey = mem->stag;
	mr->base_mr.lkey = mr->base_mr.rkey;

	return 0;
}
125 
siw_mr_drop_mem(struct siw_mr * mr)126 void siw_mr_drop_mem(struct siw_mr *mr)
127 {
128 	struct siw_mem *mem = mr->mem, *found;
129 
130 	mem->stag_valid = 0;
131 
132 	/* make STag invalid visible asap */
133 	smp_mb();
134 
135 	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
136 	WARN_ON(found != mem);
137 	siw_mem_put(mem);
138 }
139 
siw_free_mem(struct kref * ref)140 void siw_free_mem(struct kref *ref)
141 {
142 	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
143 
144 	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
145 
146 	if (!mem->is_mw && mem->mem_obj) {
147 		if (mem->is_pbl == 0)
148 			siw_umem_release(mem->umem, true);
149 		else
150 			kfree(mem->pbl);
151 	}
152 	kfree(mem);
153 }
154 
155 /*
156  * siw_check_mem()
157  *
158  * Check protection domain, STAG state, access permissions and
159  * address range for memory object.
160  *
161  * @pd:		Protection Domain memory should belong to
162  * @mem:	memory to be checked
163  * @addr:	starting addr of mem
164  * @perms:	requested access permissions
165  * @len:	len of memory interval to be checked
166  *
167  */
siw_check_mem(struct ib_pd * pd,struct siw_mem * mem,u64 addr,enum ib_access_flags perms,int len)168 int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
169 		  enum ib_access_flags perms, int len)
170 {
171 	if (!mem->stag_valid) {
172 		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
173 		return -E_STAG_INVALID;
174 	}
175 	if (mem->pd != pd) {
176 		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
177 		return -E_PD_MISMATCH;
178 	}
179 	/*
180 	 * check access permissions
181 	 */
182 	if ((mem->perms & perms) < perms) {
183 		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
184 			   mem->perms, perms);
185 		return -E_ACCESS_PERM;
186 	}
187 	/*
188 	 * Check if access falls into valid memory interval.
189 	 */
190 	if (addr < mem->va || addr + len > mem->va + mem->len) {
191 		siw_dbg_pd(pd, "MEM interval len %d\n", len);
192 		siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
193 			   (void *)(uintptr_t)addr,
194 			   (void *)(uintptr_t)(addr + len));
195 		siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
196 			   (void *)(uintptr_t)mem->va,
197 			   (void *)(uintptr_t)(mem->va + mem->len),
198 			   mem->stag);
199 
200 		return -E_BASE_BOUNDS;
201 	}
202 	return E_ACCESS_OK;
203 }
204 
205 /*
206  * siw_check_sge()
207  *
208  * Check SGE for access rights in given interval
209  *
210  * @pd:		Protection Domain memory should belong to
211  * @sge:	SGE to be checked
212  * @mem:	location of memory reference within array
213  * @perms:	requested access permissions
214  * @off:	starting offset in SGE
215  * @len:	len of memory interval to be checked
216  *
217  * NOTE: Function references SGE's memory object (mem->obj)
218  * if not yet done. New reference is kept if check went ok and
219  * released if check failed. If mem->obj is already valid, no new
220  * lookup is being done and mem is not released it check fails.
221  */
siw_check_sge(struct ib_pd * pd,struct siw_sge * sge,struct siw_mem * mem[],enum ib_access_flags perms,u32 off,int len)222 int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
223 		  enum ib_access_flags perms, u32 off, int len)
224 {
225 	struct siw_device *sdev = to_siw_dev(pd->device);
226 	struct siw_mem *new = NULL;
227 	int rv = E_ACCESS_OK;
228 
229 	if (len + off > sge->length) {
230 		rv = -E_BASE_BOUNDS;
231 		goto fail;
232 	}
233 	if (*mem == NULL) {
234 		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
235 		if (unlikely(!new)) {
236 			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
237 			rv = -E_STAG_INVALID;
238 			goto fail;
239 		}
240 		*mem = new;
241 	}
242 	/* Check if user re-registered with different STag key */
243 	if (unlikely((*mem)->stag != sge->lkey)) {
244 		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
245 		rv = -E_STAG_INVALID;
246 		goto fail;
247 	}
248 	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
249 	if (unlikely(rv))
250 		goto fail;
251 
252 	return 0;
253 
254 fail:
255 	if (new) {
256 		*mem = NULL;
257 		siw_mem_put(new);
258 	}
259 	return rv;
260 }
261 
siw_wqe_put_mem(struct siw_wqe * wqe,enum siw_opcode op)262 void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
263 {
264 	switch (op) {
265 	case SIW_OP_SEND:
266 	case SIW_OP_WRITE:
267 	case SIW_OP_SEND_WITH_IMM:
268 	case SIW_OP_SEND_REMOTE_INV:
269 	case SIW_OP_READ:
270 	case SIW_OP_READ_LOCAL_INV:
271 		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
272 			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
273 		break;
274 
275 	case SIW_OP_RECEIVE:
276 		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
277 		break;
278 
279 	case SIW_OP_READ_RESPONSE:
280 		siw_unref_mem_sgl(wqe->mem, 1);
281 		break;
282 
283 	default:
284 		/*
285 		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
286 		 * do not hold memory references
287 		 */
288 		break;
289 	}
290 }
291 
/*
 * siw_invalidate_stag()
 *
 * Mark the memory object registered under @stag as invalid, provided
 * it belongs to protection domain @pd. The reference taken for the
 * lookup is dropped again before returning.
 *
 * Returns 0 on success, -EINVAL if the STag is unknown, or -EACCES
 * if the memory belongs to a different PD.
 */
int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem;
	int rv;

	mem = siw_mem_id2obj(sdev, stag >> 8);
	if (unlikely(!mem)) {
		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
		return -EINVAL;
	}

	if (likely(mem->pd == pd)) {
		/*
		 * Per RDMA verbs definition, an STag may already be in
		 * invalid state if invalidation is requested. So no
		 * state check here.
		 */
		mem->stag_valid = 0;
		rv = 0;

		siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
	} else {
		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
		rv = -EACCES;
	}

	siw_mem_put(mem);
	return rv;
}
318 
319 /*
320  * Gets physical address backed by PBL element. Address is referenced
321  * by linear byte offset into list of variably sized PB elements.
322  * Optionally, provides remaining len within current element, and
323  * current PBL index for later resume at same element.
324  */
siw_pbl_get_buffer(struct siw_pbl * pbl,u64 off,int * len,int * idx)325 dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
326 {
327 	int i = idx ? *idx : 0;
328 
329 	while (i < pbl->num_buf) {
330 		struct siw_pble *pble = &pbl->pbe[i];
331 
332 		if (pble->pbl_off + pble->size > off) {
333 			u64 pble_off = off - pble->pbl_off;
334 
335 			if (len)
336 				*len = pble->size - pble_off;
337 			if (idx)
338 				*idx = i;
339 
340 			return pble->addr + pble_off;
341 		}
342 		i++;
343 	}
344 	if (len)
345 		*len = 0;
346 	return 0;
347 }
348 
/*
 * siw_pbl_alloc()
 *
 * Allocate a zeroed page buffer list with room for @num_buf elements
 * and record that capacity in max_buf.
 *
 * Returns the new PBL, ERR_PTR(-EINVAL) if @num_buf is zero, or
 * ERR_PTR(-ENOMEM) if the allocation failed.
 */
struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
	struct siw_pbl *pbl;

	if (!num_buf)
		return ERR_PTR(-EINVAL);

	pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL);
	if (pbl) {
		pbl->max_buf = num_buf;
		return pbl;
	}
	return ERR_PTR(-ENOMEM);
}
364 
/*
 * siw_umem_get()
 *
 * Pin the user pages backing the address range [start, start + len)
 * of the current process and record them in a newly allocated user
 * memory object. Page pointers are kept in chunks of up to
 * PAGES_PER_CHUNK entries each.
 *
 * @start:	user virtual address the range starts at
 * @len:	length of the range in bytes, must not be zero
 * @writable:	pin the pages for write access
 *
 * The complete number of pages is added to the pinned_vm count of
 * the mm up front and checked against RLIMIT_MEMLOCK. If the function
 * fails, the share not pinned is subtracted again here and the share
 * already pinned is handled by siw_umem_release().
 *
 * Returns the user memory object on success. On failure, returns
 * an ERR_PTR() and never NULL:
 * -EPERM	if can_do_mlock() denies locking memory,
 * -EINVAL	if @len is zero, the range wraps around the address
 *		space, or its page count does not fit an int,
 * -ENOMEM	if RLIMIT_MEMLOCK would be exceeded or an allocation
 *		failed,
 * or the error reported by pin_user_pages().
 */
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
	struct siw_umem *umem;
	struct mm_struct *mm_s;
	u64 first_page_va, npages;
	unsigned long mlock_limit;
	unsigned int foll_flags = FOLL_LONGTERM;
	int num_pages, num_chunks, i, rv = 0;

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (!len)
		return ERR_PTR(-EINVAL);

	/* The end of the range must not wrap around the address space */
	if (start + len < start)
		return ERR_PTR(-EINVAL);

	first_page_va = start & PAGE_MASK;
	npages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
	/*
	 * Page counts are kept as int from here on. Refuse a count which
	 * does not survive the narrowing: a truncated count of zero would
	 * skip the pin loop below and end up returning ERR_PTR(0), which
	 * is NULL and not recognized as an error by IS_ERR().
	 */
	num_pages = npages;
	if (num_pages <= 0 || (u64)num_pages != npages)
		return ERR_PTR(-EINVAL);

	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	mm_s = current->mm;
	umem->owning_mm = mm_s;
	umem->writable = writable;

	/* Reference is dropped again by siw_umem_release() */
	mmgrab(mm_s);

	if (writable)
		foll_flags |= FOLL_WRITE;

	mmap_read_lock(mm_s);

	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	/* Account the full amount at once, before pinning anything */
	if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	umem->fp_addr = first_page_va;

	umem->page_chunk =
		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
	if (!umem->page_chunk) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	/* num_pages counts down to the pages still to be pinned */
	for (i = 0; num_pages; i++) {
		int nents = min_t(int, num_pages, PAGES_PER_CHUNK);
		struct page **plist =
			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);

		if (!plist) {
			rv = -ENOMEM;
			goto out_sem_up;
		}
		umem->page_chunk[i].plist = plist;
		while (nents) {
			rv = pin_user_pages(first_page_va, nents, foll_flags,
					    plist);
			if (rv <= 0) {
				/*
				 * A return of zero would never advance
				 * this loop; treat it as a fault.
				 */
				if (!rv)
					rv = -EFAULT;
				/*
				 * siw_umem_release() stops walking the
				 * chunks once umem->num_pages pages are
				 * released. A chunk without any pinned
				 * page is never reached there, so its
				 * page list must be freed here.
				 */
				if (plist == umem->page_chunk[i].plist) {
					kfree(plist);
					umem->page_chunk[i].plist = NULL;
				}
				goto out_sem_up;
			}
			umem->num_pages += rv;
			first_page_va += rv * PAGE_SIZE;
			plist += rv;
			nents -= rv;
			num_pages -= rv;
		}
	}
	mmap_read_unlock(mm_s);

	return umem;

out_sem_up:
	mmap_read_unlock(mm_s);

	/* Adjust accounting for pages not pinned */
	if (num_pages)
		atomic64_sub(num_pages, &mm_s->pinned_vm);

	/* Unpins and un-accounts the umem->num_pages pages pinned so far */
	siw_umem_release(umem, false);

	return ERR_PTR(rv);
}
450