// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/gfp.h>
#include <rdma/ib_verbs.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/resource.h>

#include "siw.h"
#include "siw_mem.h"

/*
 * Stag lookup is based on its index part only (24 bits).
 * The code avoids special Stag of zero and tries to randomize
 * STag values between 1 and SIW_STAG_MAX_INDEX.
 */
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
			    GFP_KERNEL) < 0)
		return -ENOMEM;

	/* Set the STag index part */
	m->stag = id << 8;

	siw_dbg_mem(m, "new MEM object\n");

	return 0;
}

/*
 * siw_mem_id2obj()
 *
 * resolves memory from stag given by id. might be called from:
 * o process context before sending out of sgl, or
 * o in softirq when resolving target memory
 */
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
{
	struct siw_mem *mem;

	rcu_read_lock();
	mem = xa_load(&sdev->mem_xa, stag_index);
	if (likely(mem && kref_get_unless_zero(&mem->ref))) {
		rcu_read_unlock();
		return mem;
	}
	rcu_read_unlock();

	return NULL;
}

static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
			   bool dirty)
{
	struct page **p = chunk->plist;

	while (num_pages--) {
		if (!PageDirty(*p) && dirty)
			put_user_pages_dirty_lock(p, 1);
		else
			put_user_page(*p);
		p++;
	}
}

void siw_umem_release(struct siw_umem *umem, bool dirty)
{
	struct mm_struct *mm_s = umem->owning_mm;
	int i, num_pages = umem->num_pages;

	for (i = 0; num_pages; i++) {
		int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);

		siw_free_plist(&umem->page_chunk[i], to_free,
			       umem->writable && dirty);
		kfree(umem->page_chunk[i].plist);
		num_pages -= to_free;
	}
	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

	mmdrop(mm_s);
	kfree(umem->page_chunk);
	kfree(umem);
}

int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
		   u64 start, u64 len, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
	u32 id, next;

	if (!mem)
		return -ENOMEM;

	mem->mem_obj = mem_obj;
	mem->stag_valid = 0;
	mem->sdev = sdev;
	mem->va = start;
	mem->len = len;
	mem->pd = pd;
	mem->perms = rights & IWARP_ACCESS_MASK;
	kref_init(&mem->ref);

	mr->mem = mem;

	get_random_bytes(&next, 4);
	next &= 0x00ffffff;

	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
			    GFP_KERNEL) < 0) {
		kfree(mem);
		return -ENOMEM;
	}
	/* Set the STag index part */
	mem->stag = id << 8;
	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;

	return 0;
}

void siw_mr_drop_mem(struct siw_mr *mr)
{
	struct siw_mem *mem = mr->mem, *found;

	mem->stag_valid = 0;

	/* make STag invalid visible asap */
	smp_mb();

	found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
	WARN_ON(found != mem);
	siw_mem_put(mem);
}

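/*
 * siw_free_mem()
 *
 * Final kref release function: frees the memory object together with
 * its pinned user memory or its physical buffer list, if present.
 */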
void siw_free_mem(struct kref *ref)
{
	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");

	if (!mem->is_mw && mem->mem_obj) {
		if (mem->is_pbl == 0)
			siw_umem_release(mem->umem, true);
		else
			kfree(mem->pbl);
	}
	kfree(mem);
}

/*
 * siw_check_mem()
 *
 * Check protection domain, STAG state, access permissions and
 * address range for memory object.
 *
 * @pd: Protection Domain memory should belong to
 * @mem: memory to be checked
 * @addr: starting addr of mem
 * @perms: requested access permissions
 * @len: len of memory interval to be checked
 */
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
		  enum ib_access_flags perms, int len)
{
	if (!mem->stag_valid) {
		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
		return -E_STAG_INVALID;
	}
	if (mem->pd != pd) {
		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
		return -E_PD_MISMATCH;
	}
	/*
	 * check access permissions
	 */
	if ((mem->perms & perms) < perms) {
		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
			   mem->perms, perms);
		return -E_ACCESS_PERM;
	}
	/*
	 * Check if access falls into valid memory interval.
	 */
	if (addr < mem->va || addr + len > mem->va + mem->len) {
		siw_dbg_pd(pd, "MEM interval len %d\n", len);
		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
			   (unsigned long long)addr,
			   (unsigned long long)(addr + len));
		siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
			   (unsigned long long)mem->va,
			   (unsigned long long)(mem->va + mem->len),
			   mem->stag);

		return -E_BASE_BOUNDS;
	}
	return E_ACCESS_OK;
}

/*
 * siw_check_sge()
 *
 * Check SGE for access rights in given interval
 *
 * @pd: Protection Domain memory should belong to
 * @sge: SGE to be checked
 * @mem: location of memory reference within array
 * @perms: requested access permissions
 * @off: starting offset in SGE
 * @len: len of memory interval to be checked
 *
 * NOTE: Function references SGE's memory object (mem->obj)
 * if not yet done. The new reference is kept if the check went ok
 * and released if it failed. If mem->obj is already valid, no new
 * lookup is done and mem is not released if the check fails.
 */
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
		  enum ib_access_flags perms, u32 off, int len)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *new = NULL;
	int rv = E_ACCESS_OK;

	if (len + off > sge->length) {
		rv = -E_BASE_BOUNDS;
		goto fail;
	}
	if (*mem == NULL) {
		new = siw_mem_id2obj(sdev, sge->lkey >> 8);
		if (unlikely(!new)) {
			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
			rv = -E_STAG_INVALID;
			goto fail;
		}
		*mem = new;
	}
	/* Check if user re-registered with different STag key */
	if (unlikely((*mem)->stag != sge->lkey)) {
		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
		rv = -E_STAG_INVALID;
		goto fail;
	}
	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
	if (unlikely(rv))
		goto fail;

	return 0;

fail:
	if (new) {
		*mem = NULL;
		siw_mem_put(new);
	}
	return rv;
}

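/*
 * siw_wqe_put_mem()
 *
 * Release the memory references a work queue element holds on its
 * SGL, depending on the work request type. Inlined data and pure
 * STag operations hold no memory references.
 */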
void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
{
	switch (op) {
	case SIW_OP_SEND:
	case SIW_OP_WRITE:
	case SIW_OP_SEND_WITH_IMM:
	case SIW_OP_SEND_REMOTE_INV:
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		if (!(wqe->sqe.flags & SIW_WQE_INLINE))
			siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
		break;

	case SIW_OP_RECEIVE:
		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
		break;

	case SIW_OP_READ_RESPONSE:
		siw_unref_mem_sgl(wqe->mem, 1);
		break;

	default:
		/*
		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
		 * do not hold memory references
		 */
		break;
	}
}

int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
	int rv = 0;

	if (unlikely(!mem)) {
		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
		return -EINVAL;
	}
	if (unlikely(mem->pd != pd)) {
		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
		rv = -EACCES;
		goto out;
	}
	/*
	 * Per RDMA verbs definition, an STag may already be in invalid
	 * state if invalidation is requested. So no state check here.
	 */
	mem->stag_valid = 0;

	siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
out:
	siw_mem_put(mem);
	return rv;
}

/*
 * Gets physical address backed by PBL element. Address is referenced
 * by linear byte offset into list of variably sized PB elements.
 * Optionally, provides remaining len within current element, and
 * current PBL index for later resume at same element.
 */
u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
{
	int i = idx ? *idx : 0;

	while (i < pbl->num_buf) {
		struct siw_pble *pble = &pbl->pbe[i];

		if (pble->pbl_off + pble->size > off) {
			u64 pble_off = off - pble->pbl_off;

			if (len)
				*len = pble->size - pble_off;
			if (idx)
				*idx = i;

			return pble->addr + pble_off;
		}
		i++;
	}
	if (len)
		*len = 0;
	return 0;
}

struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
	struct siw_pbl *pbl;
	int buf_size = sizeof(*pbl);

	if (num_buf == 0)
		return ERR_PTR(-EINVAL);

	buf_size += ((num_buf - 1) * sizeof(struct siw_pble));

	pbl = kzalloc(buf_size, GFP_KERNEL);
	if (!pbl)
		return ERR_PTR(-ENOMEM);

	pbl->max_buf = num_buf;

	return pbl;
}

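/*
 * siw_umem_get()
 *
 * Pin the user memory range [start, start + len) for longterm use and
 * keep the page pointers in chunks of up to PAGES_PER_CHUNK entries.
 * Pinned pages are accounted against the owning mm's RLIMIT_MEMLOCK.
 */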
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
	struct siw_umem *umem;
	struct mm_struct *mm_s;
	u64 first_page_va;
	unsigned long mlock_limit;
	unsigned int foll_flags = FOLL_WRITE;
	int num_pages, num_chunks, i, rv = 0;

	if (!can_do_mlock())
		return ERR_PTR(-EPERM);

	if (!len)
		return ERR_PTR(-EINVAL);

	first_page_va = start & PAGE_MASK;
	num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
	num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	mm_s = current->mm;
	umem->owning_mm = mm_s;
	umem->writable = writable;

	mmgrab(mm_s);

	if (!writable)
		foll_flags |= FOLL_FORCE;

	down_read(&mm_s->mmap_sem);

	mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	umem->fp_addr = first_page_va;

	umem->page_chunk =
		kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
	if (!umem->page_chunk) {
		rv = -ENOMEM;
		goto out_sem_up;
	}
	for (i = 0; num_pages; i++) {
		int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);

		umem->page_chunk[i].plist =
			kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
		if (!umem->page_chunk[i].plist) {
			rv = -ENOMEM;
			goto out_sem_up;
		}
		got = 0;
		while (nents) {
			struct page **plist = &umem->page_chunk[i].plist[got];

			rv = get_user_pages(first_page_va, nents,
					    foll_flags | FOLL_LONGTERM,
					    plist, NULL);
			if (rv < 0)
				goto out_sem_up;

			umem->num_pages += rv;
			atomic64_add(rv, &mm_s->pinned_vm);
			first_page_va += rv * PAGE_SIZE;
			nents -= rv;
			got += rv;
		}
		num_pages -= got;
	}
out_sem_up:
	up_read(&mm_s->mmap_sem);

	if (rv > 0)
		return umem;

	siw_umem_release(umem, false);

	return ERR_PTR(rv);
}