1 /* 2 * Copyright(c) 2015 - 2017 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 #include <linux/mm.h> 48 #include <linux/types.h> 49 #include <linux/device.h> 50 #include <linux/dmapool.h> 51 #include <linux/slab.h> 52 #include <linux/list.h> 53 #include <linux/highmem.h> 54 #include <linux/io.h> 55 #include <linux/uio.h> 56 #include <linux/rbtree.h> 57 #include <linux/spinlock.h> 58 #include <linux/delay.h> 59 #include <linux/kthread.h> 60 #include <linux/mmu_context.h> 61 #include <linux/module.h> 62 #include <linux/vmalloc.h> 63 #include <linux/string.h> 64 65 #include "hfi.h" 66 #include "sdma.h" 67 #include "mmu_rb.h" 68 #include "user_sdma.h" 69 #include "verbs.h" /* for the headers */ 70 #include "common.h" /* for struct hfi1_tid_info */ 71 #include "trace.h" 72 73 static uint hfi1_sdma_comp_ring_size = 128; 74 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 75 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 76 77 static unsigned initial_pkt_count = 8; 78 79 static int user_sdma_send_pkts(struct user_sdma_request *req, 80 unsigned maxpkts); 81 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); 82 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); 83 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); 84 static int pin_vector_pages(struct user_sdma_request *req, 85 struct user_sdma_iovec *iovec); 86 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 87 unsigned start, unsigned npages); 88 static int check_header_template(struct user_sdma_request *req, 89 struct hfi1_pkt_header *hdr, u32 lrhlen, 90 u32 datalen); 91 static int set_txreq_header(struct user_sdma_request *req, 92 struct user_sdma_txreq *tx, u32 datalen); 93 static int set_txreq_header_ahg(struct user_sdma_request *req, 94 struct user_sdma_txreq *tx, u32 len); 95 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 96 struct hfi1_user_sdma_comp_q *cq, 97 u16 idx, enum hfi1_sdma_comp_state state, 98 int ret); 99 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); 100 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 101 102 static int defer_packet_queue( 103 struct sdma_engine *sde, 104 struct iowait *wait, 105 struct sdma_txreq *txreq, 106 uint seq, 107 bool pkts_sent); 108 static void activate_packet_queue(struct iowait *wait, int reason); 109 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 110 unsigned long len); 111 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); 112 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 113 void *arg2, bool *stop); 114 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); 115 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); 116 117 static struct mmu_rb_ops sdma_rb_ops = { 118 .filter = sdma_rb_filter, 119 .insert = sdma_rb_insert, 120 .evict = sdma_rb_evict, 121 .remove = sdma_rb_remove, 122 .invalidate = sdma_rb_invalidate 123 }; 124 125 static int defer_packet_queue( 126 struct sdma_engine *sde, 127 struct iowait *wait, 128 struct sdma_txreq *txreq, 129 uint seq, 130 bool pkts_sent) 131 { 132 struct hfi1_user_sdma_pkt_q *pq = 133 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 134 struct hfi1_ibdev *dev = &pq->dd->verbs_dev; 135 struct user_sdma_txreq *tx = 136 container_of(txreq, struct user_sdma_txreq, txreq); 137 138 if (sdma_progress(sde, seq, txreq)) { 139 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) 140 goto eagain; 141 } 142 /* 143 * We are assuming that if the list is enqueued somewhere, it 144 * is to the dmawait list since that is the only place where 145 * it is supposed to be enqueued. 146 */ 147 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 148 write_seqlock(&dev->iowait_lock); 149 if (list_empty(&pq->busy.list)) 150 iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); 151 write_sequnlock(&dev->iowait_lock); 152 return -EBUSY; 153 eagain: 154 return -EAGAIN; 155 } 156 157 static void activate_packet_queue(struct iowait *wait, int reason) 158 { 159 struct hfi1_user_sdma_pkt_q *pq = 160 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 161 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 162 wake_up(&wait->wait_dma); 163 }; 164 165 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, 166 struct hfi1_filedata *fd) 167 { 168 int ret = -ENOMEM; 169 char buf[64]; 170 struct hfi1_devdata *dd; 171 struct hfi1_user_sdma_comp_q *cq; 172 struct hfi1_user_sdma_pkt_q *pq; 173 174 if (!uctxt || !fd) 175 return -EBADF; 176 177 if (!hfi1_sdma_comp_ring_size) 178 return -EINVAL; 179 180 dd = uctxt->dd; 181 182 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 183 if (!pq) 184 return -ENOMEM; 185 186 pq->dd = dd; 187 pq->ctxt = uctxt->ctxt; 188 pq->subctxt = fd->subctxt; 189 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 190 pq->state = SDMA_PKT_Q_INACTIVE; 191 atomic_set(&pq->n_reqs, 0); 192 init_waitqueue_head(&pq->wait); 193 atomic_set(&pq->n_locked, 0); 194 pq->mm = fd->mm; 195 196 iowait_init(&pq->busy, 0, NULL, defer_packet_queue, 197 activate_packet_queue, NULL); 198 pq->reqidx = 0; 199 200 pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, 201 sizeof(*pq->reqs), 202 GFP_KERNEL); 203 if (!pq->reqs) 204 goto pq_reqs_nomem; 205 206 pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size), 207 sizeof(*pq->req_in_use), 208 GFP_KERNEL); 209 if (!pq->req_in_use) 210 goto pq_reqs_no_in_use; 211 212 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 213 fd->subctxt); 214 pq->txreq_cache = kmem_cache_create(buf, 215 sizeof(struct user_sdma_txreq), 216 L1_CACHE_BYTES, 217 SLAB_HWCACHE_ALIGN, 218 NULL); 219 if (!pq->txreq_cache) { 220 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 221 uctxt->ctxt); 222 goto pq_txreq_nomem; 223 } 224 225 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 226 if (!cq) 227 goto cq_nomem; 228 229 cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) 230 * hfi1_sdma_comp_ring_size)); 231 if (!cq->comps) 232 goto cq_comps_nomem; 233 234 cq->nentries = hfi1_sdma_comp_ring_size; 235 236 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 237 &pq->handler); 238 if (ret) { 239 dd_dev_err(dd, "Failed to register with MMU %d", ret); 240 goto pq_mmu_fail; 241 } 242 243 fd->pq = pq; 244 fd->cq = cq; 245 246 return 0; 247 248 pq_mmu_fail: 249 vfree(cq->comps); 250 cq_comps_nomem: 251 kfree(cq); 252 cq_nomem: 253 kmem_cache_destroy(pq->txreq_cache); 254 pq_txreq_nomem: 255 kfree(pq->req_in_use); 256 pq_reqs_no_in_use: 257 kfree(pq->reqs); 258 pq_reqs_nomem: 259 kfree(pq); 260 261 return ret; 262 } 263 264 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, 265 struct hfi1_ctxtdata *uctxt) 266 { 267 struct hfi1_user_sdma_pkt_q *pq; 268 269 trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); 270 271 pq = fd->pq; 272 if (pq) { 273 if (pq->handler) 274 hfi1_mmu_rb_unregister(pq->handler); 275 iowait_sdma_drain(&pq->busy); 276 /* Wait until all requests have been freed. */ 277 wait_event_interruptible( 278 pq->wait, 279 (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); 280 kfree(pq->reqs); 281 kfree(pq->req_in_use); 282 kmem_cache_destroy(pq->txreq_cache); 283 kfree(pq); 284 fd->pq = NULL; 285 } 286 if (fd->cq) { 287 vfree(fd->cq->comps); 288 kfree(fd->cq); 289 fd->cq = NULL; 290 } 291 return 0; 292 } 293 294 static u8 dlid_to_selector(u16 dlid) 295 { 296 static u8 mapping[256]; 297 static int initialized; 298 static u8 next; 299 int hash; 300 301 if (!initialized) { 302 memset(mapping, 0xFF, 256); 303 initialized = 1; 304 } 305 306 hash = ((dlid >> 8) ^ dlid) & 0xFF; 307 if (mapping[hash] == 0xFF) { 308 mapping[hash] = next; 309 next = (next + 1) & 0x7F; 310 } 311 312 return mapping[hash]; 313 } 314 315 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, 316 struct iovec *iovec, unsigned long dim, 317 unsigned long *count) 318 { 319 int ret = 0, i; 320 struct hfi1_ctxtdata *uctxt = fd->uctxt; 321 struct hfi1_user_sdma_pkt_q *pq = fd->pq; 322 struct hfi1_user_sdma_comp_q *cq = fd->cq; 323 struct hfi1_devdata *dd = pq->dd; 324 unsigned long idx = 0; 325 u8 pcount = initial_pkt_count; 326 struct sdma_req_info info; 327 struct user_sdma_request *req; 328 u8 opcode, sc, vl; 329 u16 pkey; 330 u32 slid; 331 int req_queued = 0; 332 u16 dlid; 333 u32 selector; 334 335 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 336 hfi1_cdbg( 337 SDMA, 338 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 339 dd->unit, uctxt->ctxt, fd->subctxt, 340 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 341 return -EINVAL; 342 } 343 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 344 if (ret) { 345 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 346 dd->unit, uctxt->ctxt, fd->subctxt, ret); 347 return -EFAULT; 348 } 349 350 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 351 (u16 *)&info); 352 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 353 hfi1_cdbg(SDMA, 354 "[%u:%u:%u:%u] Invalid comp index", 355 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 356 return -EINVAL; 357 } 358 359 /* 360 * Sanity check the header io vector count. Need at least 1 vector 361 * (header) and cannot be larger than the actual io vector count. 362 */ 363 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 364 hfi1_cdbg(SDMA, 365 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 366 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 367 req_iovcnt(info.ctrl), dim); 368 return -EINVAL; 369 } 370 371 if (!info.fragsize) { 372 hfi1_cdbg(SDMA, 373 "[%u:%u:%u:%u] Request does not specify fragsize", 374 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 375 return -EINVAL; 376 } 377 378 /* Try to claim the request. */ 379 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 380 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 381 dd->unit, uctxt->ctxt, fd->subctxt, 382 info.comp_idx); 383 return -EBADSLT; 384 } 385 /* 386 * All safety checks have been done and this request has been claimed. 387 */ 388 trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, 389 info.comp_idx); 390 req = pq->reqs + info.comp_idx; 391 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 392 req->data_len = 0; 393 req->pq = pq; 394 req->cq = cq; 395 req->status = -1; 396 req->ahg_idx = -1; 397 req->iov_idx = 0; 398 req->sent = 0; 399 req->seqnum = 0; 400 req->seqcomp = 0; 401 req->seqsubmitted = 0; 402 req->tids = NULL; 403 req->done = 0; 404 req->has_error = 0; 405 INIT_LIST_HEAD(&req->txps); 406 407 memcpy(&req->info, &info, sizeof(info)); 408 409 if (req_opcode(info.ctrl) == EXPECTED) { 410 /* expected must have a TID info and at least one data vector */ 411 if (req->data_iovs < 2) { 412 SDMA_DBG(req, 413 "Not enough vectors for expected request"); 414 ret = -EINVAL; 415 goto free_req; 416 } 417 req->data_iovs--; 418 } 419 420 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 421 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 422 MAX_VECTORS_PER_REQ); 423 ret = -EINVAL; 424 goto free_req; 425 } 426 /* Copy the header from the user buffer */ 427 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 428 sizeof(req->hdr)); 429 if (ret) { 430 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 431 ret = -EFAULT; 432 goto free_req; 433 } 434 435 /* If Static rate control is not enabled, sanitize the header. */ 436 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 437 req->hdr.pbc[2] = 0; 438 439 /* Validate the opcode. Do not trust packets from user space blindly. */ 440 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 441 if ((opcode & USER_OPCODE_CHECK_MASK) != 442 USER_OPCODE_CHECK_VAL) { 443 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 444 ret = -EINVAL; 445 goto free_req; 446 } 447 /* 448 * Validate the vl. Do not trust packets from user space blindly. 449 * VL comes from PBC, SC comes from LRH, and the VL needs to 450 * match the SC look up. 451 */ 452 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 453 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 454 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 455 if (vl >= dd->pport->vls_operational || 456 vl != sc_to_vlt(dd, sc)) { 457 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 458 ret = -EINVAL; 459 goto free_req; 460 } 461 462 /* Checking P_KEY for requests from user-space */ 463 pkey = (u16)be32_to_cpu(req->hdr.bth[0]); 464 slid = be16_to_cpu(req->hdr.lrh[3]); 465 if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { 466 ret = -EINVAL; 467 goto free_req; 468 } 469 470 /* 471 * Also should check the BTH.lnh. If it says the next header is GRH then 472 * the RXE parsing will be off and will land in the middle of the KDETH 473 * or miss it entirely. 474 */ 475 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 476 SDMA_DBG(req, "User tried to pass in a GRH"); 477 ret = -EINVAL; 478 goto free_req; 479 } 480 481 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 482 /* 483 * Calculate the initial TID offset based on the values of 484 * KDETH.OFFSET and KDETH.OM that are passed in. 485 */ 486 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 487 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 488 KDETH_OM_LARGE : KDETH_OM_SMALL); 489 trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, 490 info.comp_idx, req->tidoffset); 491 idx++; 492 493 /* Save all the IO vector structures */ 494 for (i = 0; i < req->data_iovs; i++) { 495 req->iovs[i].offset = 0; 496 INIT_LIST_HEAD(&req->iovs[i].list); 497 memcpy(&req->iovs[i].iov, 498 iovec + idx++, 499 sizeof(req->iovs[i].iov)); 500 ret = pin_vector_pages(req, &req->iovs[i]); 501 if (ret) { 502 req->data_iovs = i; 503 req->status = ret; 504 goto free_req; 505 } 506 req->data_len += req->iovs[i].iov.iov_len; 507 } 508 trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, 509 info.comp_idx, req->data_len); 510 if (pcount > req->info.npkts) 511 pcount = req->info.npkts; 512 /* 513 * Copy any TID info 514 * User space will provide the TID info only when the 515 * request type is EXPECTED. This is true even if there is 516 * only one packet in the request and the header is already 517 * setup. The reason for the singular TID case is that the 518 * driver needs to perform safety checks. 519 */ 520 if (req_opcode(req->info.ctrl) == EXPECTED) { 521 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 522 u32 *tmp; 523 524 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 525 ret = -EINVAL; 526 goto free_req; 527 } 528 529 /* 530 * We have to copy all of the tids because they may vary 531 * in size and, therefore, the TID count might not be 532 * equal to the pkt count. However, there is no way to 533 * tell at this point. 534 */ 535 tmp = memdup_user(iovec[idx].iov_base, 536 ntids * sizeof(*req->tids)); 537 if (IS_ERR(tmp)) { 538 ret = PTR_ERR(tmp); 539 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 540 ntids, ret); 541 goto free_req; 542 } 543 req->tids = tmp; 544 req->n_tids = ntids; 545 req->tididx = 0; 546 idx++; 547 } 548 549 dlid = be16_to_cpu(req->hdr.lrh[1]); 550 selector = dlid_to_selector(dlid); 551 selector += uctxt->ctxt + fd->subctxt; 552 req->sde = sdma_select_user_engine(dd, selector, vl); 553 554 if (!req->sde || !sdma_running(req->sde)) { 555 ret = -ECOMM; 556 goto free_req; 557 } 558 559 /* We don't need an AHG entry if the request contains only one packet */ 560 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) 561 req->ahg_idx = sdma_ahg_alloc(req->sde); 562 563 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 564 atomic_inc(&pq->n_reqs); 565 req_queued = 1; 566 /* Send the first N packets in the request to buy us some time */ 567 ret = user_sdma_send_pkts(req, pcount); 568 if (unlikely(ret < 0 && ret != -EBUSY)) { 569 req->status = ret; 570 goto free_req; 571 } 572 573 /* 574 * It is possible that the SDMA engine would have processed all the 575 * submitted packets by the time we get here. Therefore, only set 576 * packet queue state to ACTIVE if there are still uncompleted 577 * requests. 578 */ 579 if (atomic_read(&pq->n_reqs)) 580 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 581 582 /* 583 * This is a somewhat blocking send implementation. 584 * The driver will block the caller until all packets of the 585 * request have been submitted to the SDMA engine. However, it 586 * will not wait for send completions. 587 */ 588 while (req->seqsubmitted != req->info.npkts) { 589 ret = user_sdma_send_pkts(req, pcount); 590 if (ret < 0) { 591 if (ret != -EBUSY) { 592 req->status = ret; 593 WRITE_ONCE(req->has_error, 1); 594 if (READ_ONCE(req->seqcomp) == 595 req->seqsubmitted - 1) 596 goto free_req; 597 return ret; 598 } 599 wait_event_interruptible_timeout( 600 pq->busy.wait_dma, 601 (pq->state == SDMA_PKT_Q_ACTIVE), 602 msecs_to_jiffies( 603 SDMA_IOWAIT_TIMEOUT)); 604 } 605 } 606 *count += idx; 607 return 0; 608 free_req: 609 user_sdma_free_request(req, true); 610 if (req_queued) 611 pq_update(pq); 612 set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); 613 return ret; 614 } 615 616 static inline u32 compute_data_length(struct user_sdma_request *req, 617 struct user_sdma_txreq *tx) 618 { 619 /* 620 * Determine the proper size of the packet data. 621 * The size of the data of the first packet is in the header 622 * template. However, it includes the header and ICRC, which need 623 * to be subtracted. 624 * The minimum representable packet data length in a header is 4 bytes, 625 * therefore, when the data length request is less than 4 bytes, there's 626 * only one packet, and the packet data length is equal to that of the 627 * request data length. 628 * The size of the remaining packets is the minimum of the frag 629 * size (MTU) or remaining data in the request. 630 */ 631 u32 len; 632 633 if (!req->seqnum) { 634 if (req->data_len < sizeof(u32)) 635 len = req->data_len; 636 else 637 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 638 (sizeof(tx->hdr) - 4)); 639 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 640 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 641 PAGE_SIZE; 642 /* 643 * Get the data length based on the remaining space in the 644 * TID pair. 645 */ 646 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 647 /* If we've filled up the TID pair, move to the next one. */ 648 if (unlikely(!len) && ++req->tididx < req->n_tids && 649 req->tids[req->tididx]) { 650 tidlen = EXP_TID_GET(req->tids[req->tididx], 651 LEN) * PAGE_SIZE; 652 req->tidoffset = 0; 653 len = min_t(u32, tidlen, req->info.fragsize); 654 } 655 /* 656 * Since the TID pairs map entire pages, make sure that we 657 * are not going to try to send more data that we have 658 * remaining. 659 */ 660 len = min(len, req->data_len - req->sent); 661 } else { 662 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 663 } 664 trace_hfi1_sdma_user_compute_length(req->pq->dd, 665 req->pq->ctxt, 666 req->pq->subctxt, 667 req->info.comp_idx, 668 len); 669 return len; 670 } 671 672 static inline u32 pad_len(u32 len) 673 { 674 if (len & (sizeof(u32) - 1)) 675 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 676 return len; 677 } 678 679 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 680 { 681 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 682 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 683 } 684 685 static int user_sdma_txadd_ahg(struct user_sdma_request *req, 686 struct user_sdma_txreq *tx, 687 u32 datalen) 688 { 689 int ret; 690 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 691 u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); 692 struct hfi1_user_sdma_pkt_q *pq = req->pq; 693 694 /* 695 * Copy the request header into the tx header 696 * because the HW needs a cacheline-aligned 697 * address. 698 * This copy can be optimized out if the hdr 699 * member of user_sdma_request were also 700 * cacheline aligned. 701 */ 702 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 703 if (PBC2LRH(pbclen) != lrhlen) { 704 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 705 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 706 } 707 ret = check_header_template(req, &tx->hdr, lrhlen, datalen); 708 if (ret) 709 return ret; 710 ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, 711 sizeof(tx->hdr) + datalen, req->ahg_idx, 712 0, NULL, 0, user_sdma_txreq_cb); 713 if (ret) 714 return ret; 715 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); 716 if (ret) 717 sdma_txclean(pq->dd, &tx->txreq); 718 return ret; 719 } 720 721 static int user_sdma_txadd(struct user_sdma_request *req, 722 struct user_sdma_txreq *tx, 723 struct user_sdma_iovec *iovec, u32 datalen, 724 u32 *queued_ptr, u32 *data_sent_ptr, 725 u64 *iov_offset_ptr) 726 { 727 int ret; 728 unsigned int pageidx, len; 729 unsigned long base, offset; 730 u64 iov_offset = *iov_offset_ptr; 731 u32 queued = *queued_ptr, data_sent = *data_sent_ptr; 732 struct hfi1_user_sdma_pkt_q *pq = req->pq; 733 734 base = (unsigned long)iovec->iov.iov_base; 735 offset = offset_in_page(base + iovec->offset + iov_offset); 736 pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >> 737 PAGE_SHIFT); 738 len = offset + req->info.fragsize > PAGE_SIZE ? 739 PAGE_SIZE - offset : req->info.fragsize; 740 len = min((datalen - queued), len); 741 ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx], 742 offset, len); 743 if (ret) { 744 SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); 745 return ret; 746 } 747 iov_offset += len; 748 queued += len; 749 data_sent += len; 750 if (unlikely(queued < datalen && pageidx == iovec->npages && 751 req->iov_idx < req->data_iovs - 1)) { 752 iovec->offset += iov_offset; 753 iovec = &req->iovs[++req->iov_idx]; 754 iov_offset = 0; 755 } 756 757 *queued_ptr = queued; 758 *data_sent_ptr = data_sent; 759 *iov_offset_ptr = iov_offset; 760 return ret; 761 } 762 763 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) 764 { 765 int ret = 0, count; 766 unsigned npkts = 0; 767 struct user_sdma_txreq *tx = NULL; 768 struct hfi1_user_sdma_pkt_q *pq = NULL; 769 struct user_sdma_iovec *iovec = NULL; 770 771 if (!req->pq) 772 return -EINVAL; 773 774 pq = req->pq; 775 776 /* If tx completion has reported an error, we are done. */ 777 if (READ_ONCE(req->has_error)) 778 return -EFAULT; 779 780 /* 781 * Check if we might have sent the entire request already 782 */ 783 if (unlikely(req->seqnum == req->info.npkts)) { 784 if (!list_empty(&req->txps)) 785 goto dosend; 786 return ret; 787 } 788 789 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 790 maxpkts = req->info.npkts - req->seqnum; 791 792 while (npkts < maxpkts) { 793 u32 datalen = 0, queued = 0, data_sent = 0; 794 u64 iov_offset = 0; 795 796 /* 797 * Check whether any of the completions have come back 798 * with errors. If so, we are not going to process any 799 * more packets from this request. 800 */ 801 if (READ_ONCE(req->has_error)) 802 return -EFAULT; 803 804 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 805 if (!tx) 806 return -ENOMEM; 807 808 tx->flags = 0; 809 tx->req = req; 810 tx->busycount = 0; 811 INIT_LIST_HEAD(&tx->list); 812 813 /* 814 * For the last packet set the ACK request 815 * and disable header suppression. 816 */ 817 if (req->seqnum == req->info.npkts - 1) 818 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 819 TXREQ_FLAGS_REQ_DISABLE_SH); 820 821 /* 822 * Calculate the payload size - this is min of the fragment 823 * (MTU) size or the remaining bytes in the request but only 824 * if we have payload data. 825 */ 826 if (req->data_len) { 827 iovec = &req->iovs[req->iov_idx]; 828 if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { 829 if (++req->iov_idx == req->data_iovs) { 830 ret = -EFAULT; 831 goto free_tx; 832 } 833 iovec = &req->iovs[req->iov_idx]; 834 WARN_ON(iovec->offset); 835 } 836 837 datalen = compute_data_length(req, tx); 838 839 /* 840 * Disable header suppression for the payload <= 8DWS. 841 * If there is an uncorrectable error in the receive 842 * data FIFO when the received payload size is less than 843 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 844 * not reported.There is set RHF.EccErr if the header 845 * is not suppressed. 846 */ 847 if (!datalen) { 848 SDMA_DBG(req, 849 "Request has data but pkt len is 0"); 850 ret = -EFAULT; 851 goto free_tx; 852 } else if (datalen <= 32) { 853 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 854 } 855 } 856 857 if (req->ahg_idx >= 0) { 858 if (!req->seqnum) { 859 ret = user_sdma_txadd_ahg(req, tx, datalen); 860 if (ret) 861 goto free_tx; 862 } else { 863 int changes; 864 865 changes = set_txreq_header_ahg(req, tx, 866 datalen); 867 if (changes < 0) 868 goto free_tx; 869 } 870 } else { 871 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 872 datalen, user_sdma_txreq_cb); 873 if (ret) 874 goto free_tx; 875 /* 876 * Modify the header for this packet. This only needs 877 * to be done if we are not going to use AHG. Otherwise, 878 * the HW will do it based on the changes we gave it 879 * during sdma_txinit_ahg(). 880 */ 881 ret = set_txreq_header(req, tx, datalen); 882 if (ret) 883 goto free_txreq; 884 } 885 886 /* 887 * If the request contains any data vectors, add up to 888 * fragsize bytes to the descriptor. 889 */ 890 while (queued < datalen && 891 (req->sent + data_sent) < req->data_len) { 892 ret = user_sdma_txadd(req, tx, iovec, datalen, 893 &queued, &data_sent, &iov_offset); 894 if (ret) 895 goto free_txreq; 896 } 897 /* 898 * The txreq was submitted successfully so we can update 899 * the counters. 900 */ 901 req->koffset += datalen; 902 if (req_opcode(req->info.ctrl) == EXPECTED) 903 req->tidoffset += datalen; 904 req->sent += data_sent; 905 if (req->data_len) 906 iovec->offset += iov_offset; 907 list_add_tail(&tx->txreq.list, &req->txps); 908 /* 909 * It is important to increment this here as it is used to 910 * generate the BTH.PSN and, therefore, can't be bulk-updated 911 * outside of the loop. 912 */ 913 tx->seqnum = req->seqnum++; 914 npkts++; 915 } 916 dosend: 917 ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); 918 req->seqsubmitted += count; 919 if (req->seqsubmitted == req->info.npkts) { 920 WRITE_ONCE(req->done, 1); 921 /* 922 * The txreq has already been submitted to the HW queue 923 * so we can free the AHG entry now. Corruption will not 924 * happen due to the sequential manner in which 925 * descriptors are processed. 926 */ 927 if (req->ahg_idx >= 0) 928 sdma_ahg_free(req->sde, req->ahg_idx); 929 } 930 return ret; 931 932 free_txreq: 933 sdma_txclean(pq->dd, &tx->txreq); 934 free_tx: 935 kmem_cache_free(pq->txreq_cache, tx); 936 return ret; 937 } 938 939 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 940 { 941 struct evict_data evict_data; 942 943 evict_data.cleared = 0; 944 evict_data.target = npages; 945 hfi1_mmu_rb_evict(pq->handler, &evict_data); 946 return evict_data.cleared; 947 } 948 949 static int pin_sdma_pages(struct user_sdma_request *req, 950 struct user_sdma_iovec *iovec, 951 struct sdma_mmu_node *node, 952 int npages) 953 { 954 int pinned, cleared; 955 struct page **pages; 956 struct hfi1_user_sdma_pkt_q *pq = req->pq; 957 958 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 959 if (!pages) 960 return -ENOMEM; 961 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 962 963 npages -= node->npages; 964 retry: 965 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 966 atomic_read(&pq->n_locked), npages)) { 967 cleared = sdma_cache_evict(pq, npages); 968 if (cleared >= npages) 969 goto retry; 970 } 971 pinned = hfi1_acquire_user_pages(pq->mm, 972 ((unsigned long)iovec->iov.iov_base + 973 (node->npages * PAGE_SIZE)), npages, 0, 974 pages + node->npages); 975 if (pinned < 0) { 976 kfree(pages); 977 return pinned; 978 } 979 if (pinned != npages) { 980 unpin_vector_pages(pq->mm, pages, node->npages, pinned); 981 return -EFAULT; 982 } 983 kfree(node->pages); 984 node->rb.len = iovec->iov.iov_len; 985 node->pages = pages; 986 atomic_add(pinned, &pq->n_locked); 987 return pinned; 988 } 989 990 static void unpin_sdma_pages(struct sdma_mmu_node *node) 991 { 992 if (node->npages) { 993 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 994 atomic_sub(node->npages, &node->pq->n_locked); 995 } 996 } 997 998 static int pin_vector_pages(struct user_sdma_request *req, 999 struct user_sdma_iovec *iovec) 1000 { 1001 int ret = 0, pinned, npages; 1002 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1003 struct sdma_mmu_node *node = NULL; 1004 struct mmu_rb_node *rb_node; 1005 struct iovec *iov; 1006 bool extracted; 1007 1008 extracted = 1009 hfi1_mmu_rb_remove_unless_exact(pq->handler, 1010 (unsigned long) 1011 iovec->iov.iov_base, 1012 iovec->iov.iov_len, &rb_node); 1013 if (rb_node) { 1014 node = container_of(rb_node, struct sdma_mmu_node, rb); 1015 if (!extracted) { 1016 atomic_inc(&node->refcount); 1017 iovec->pages = node->pages; 1018 iovec->npages = node->npages; 1019 iovec->node = node; 1020 return 0; 1021 } 1022 } 1023 1024 if (!node) { 1025 node = kzalloc(sizeof(*node), GFP_KERNEL); 1026 if (!node) 1027 return -ENOMEM; 1028 1029 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1030 node->pq = pq; 1031 atomic_set(&node->refcount, 0); 1032 } 1033 1034 iov = &iovec->iov; 1035 npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len); 1036 if (node->npages < npages) { 1037 pinned = pin_sdma_pages(req, iovec, node, npages); 1038 if (pinned < 0) { 1039 ret = pinned; 1040 goto bail; 1041 } 1042 node->npages += pinned; 1043 npages = node->npages; 1044 } 1045 iovec->pages = node->pages; 1046 iovec->npages = npages; 1047 iovec->node = node; 1048 1049 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1050 if (ret) { 1051 iovec->node = NULL; 1052 goto bail; 1053 } 1054 return 0; 1055 bail: 1056 unpin_sdma_pages(node); 1057 kfree(node); 1058 return ret; 1059 } 1060 1061 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1062 unsigned start, unsigned npages) 1063 { 1064 hfi1_release_user_pages(mm, pages + start, npages, false); 1065 kfree(pages); 1066 } 1067 1068 static int check_header_template(struct user_sdma_request *req, 1069 struct hfi1_pkt_header *hdr, u32 lrhlen, 1070 u32 datalen) 1071 { 1072 /* 1073 * Perform safety checks for any type of packet: 1074 * - transfer size is multiple of 64bytes 1075 * - packet length is multiple of 4 bytes 1076 * - packet length is not larger than MTU size 1077 * 1078 * These checks are only done for the first packet of the 1079 * transfer since the header is "given" to us by user space. 1080 * For the remainder of the packets we compute the values. 1081 */ 1082 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1083 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1084 return -EINVAL; 1085 1086 if (req_opcode(req->info.ctrl) == EXPECTED) { 1087 /* 1088 * The header is checked only on the first packet. Furthermore, 1089 * we ensure that at least one TID entry is copied when the 1090 * request is submitted. Therefore, we don't have to verify that 1091 * tididx points to something sane. 1092 */ 1093 u32 tidval = req->tids[req->tididx], 1094 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1095 tididx = EXP_TID_GET(tidval, IDX), 1096 tidctrl = EXP_TID_GET(tidval, CTRL), 1097 tidoff; 1098 __le32 kval = hdr->kdeth.ver_tid_offset; 1099 1100 tidoff = KDETH_GET(kval, OFFSET) * 1101 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1102 KDETH_OM_LARGE : KDETH_OM_SMALL); 1103 /* 1104 * Expected receive packets have the following 1105 * additional checks: 1106 * - offset is not larger than the TID size 1107 * - TIDCtrl values match between header and TID array 1108 * - TID indexes match between header and TID array 1109 */ 1110 if ((tidoff + datalen > tidlen) || 1111 KDETH_GET(kval, TIDCTRL) != tidctrl || 1112 KDETH_GET(kval, TID) != tididx) 1113 return -EINVAL; 1114 } 1115 return 0; 1116 } 1117 1118 /* 1119 * Correctly set the BTH.PSN field based on type of 1120 * transfer - eager packets can just increment the PSN but 1121 * expected packets encode generation and sequence in the 1122 * BTH.PSN field so just incrementing will result in errors. 1123 */ 1124 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1125 { 1126 u32 val = be32_to_cpu(bthpsn), 1127 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 1128 0xffffffull), 1129 psn = val & mask; 1130 if (expct) 1131 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); 1132 else 1133 psn = psn + frags; 1134 return psn & mask; 1135 } 1136 1137 static int set_txreq_header(struct user_sdma_request *req, 1138 struct user_sdma_txreq *tx, u32 datalen) 1139 { 1140 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1141 struct hfi1_pkt_header *hdr = &tx->hdr; 1142 u8 omfactor; /* KDETH.OM */ 1143 u16 pbclen; 1144 int ret; 1145 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1146 1147 /* Copy the header template to the request before modification */ 1148 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1149 1150 /* 1151 * Check if the PBC and LRH length are mismatched. If so 1152 * adjust both in the header. 1153 */ 1154 pbclen = le16_to_cpu(hdr->pbc[0]); 1155 if (PBC2LRH(pbclen) != lrhlen) { 1156 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1157 hdr->pbc[0] = cpu_to_le16(pbclen); 1158 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1159 /* 1160 * Third packet 1161 * This is the first packet in the sequence that has 1162 * a "static" size that can be used for the rest of 1163 * the packets (besides the last one). 1164 */ 1165 if (unlikely(req->seqnum == 2)) { 1166 /* 1167 * From this point on the lengths in both the 1168 * PBC and LRH are the same until the last 1169 * packet. 1170 * Adjust the template so we don't have to update 1171 * every packet 1172 */ 1173 req->hdr.pbc[0] = hdr->pbc[0]; 1174 req->hdr.lrh[2] = hdr->lrh[2]; 1175 } 1176 } 1177 /* 1178 * We only have to modify the header if this is not the 1179 * first packet in the request. Otherwise, we use the 1180 * header given to us. 1181 */ 1182 if (unlikely(!req->seqnum)) { 1183 ret = check_header_template(req, hdr, lrhlen, datalen); 1184 if (ret) 1185 return ret; 1186 goto done; 1187 } 1188 1189 hdr->bth[2] = cpu_to_be32( 1190 set_pkt_bth_psn(hdr->bth[2], 1191 (req_opcode(req->info.ctrl) == EXPECTED), 1192 req->seqnum)); 1193 1194 /* Set ACK request on last packet */ 1195 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1196 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1197 1198 /* Set the new offset */ 1199 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1200 /* Expected packets have to fill in the new TID information */ 1201 if (req_opcode(req->info.ctrl) == EXPECTED) { 1202 tidval = req->tids[req->tididx]; 1203 /* 1204 * If the offset puts us at the end of the current TID, 1205 * advance everything. 1206 */ 1207 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1208 PAGE_SIZE)) { 1209 req->tidoffset = 0; 1210 /* 1211 * Since we don't copy all the TIDs, all at once, 1212 * we have to check again. 1213 */ 1214 if (++req->tididx > req->n_tids - 1 || 1215 !req->tids[req->tididx]) { 1216 return -EINVAL; 1217 } 1218 tidval = req->tids[req->tididx]; 1219 } 1220 omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1221 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : 1222 KDETH_OM_SMALL_SHIFT; 1223 /* Set KDETH.TIDCtrl based on value for this TID. */ 1224 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1225 EXP_TID_GET(tidval, CTRL)); 1226 /* Set KDETH.TID based on value for this TID */ 1227 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1228 EXP_TID_GET(tidval, IDX)); 1229 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1230 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1231 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1232 /* 1233 * Set the KDETH.OFFSET and KDETH.OM based on size of 1234 * transfer. 1235 */ 1236 trace_hfi1_sdma_user_tid_info( 1237 pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, 1238 req->tidoffset, req->tidoffset >> omfactor, 1239 omfactor != KDETH_OM_SMALL_SHIFT); 1240 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1241 req->tidoffset >> omfactor); 1242 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1243 omfactor != KDETH_OM_SMALL_SHIFT); 1244 } 1245 done: 1246 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1247 req->info.comp_idx, hdr, tidval); 1248 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1249 } 1250 1251 static int set_txreq_header_ahg(struct user_sdma_request *req, 1252 struct user_sdma_txreq *tx, u32 datalen) 1253 { 1254 u32 ahg[AHG_KDETH_ARRAY_SIZE]; 1255 int idx = 0; 1256 u8 omfactor; /* KDETH.OM */ 1257 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1258 struct hfi1_pkt_header *hdr = &req->hdr; 1259 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1260 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1261 size_t array_size = ARRAY_SIZE(ahg); 1262 1263 if (PBC2LRH(pbclen) != lrhlen) { 1264 /* PBC.PbcLengthDWs */ 1265 idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, 1266 (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); 1267 if (idx < 0) 1268 return idx; 1269 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1270 idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, 1271 (__force u16)cpu_to_be16(lrhlen >> 2)); 1272 if (idx < 0) 1273 return idx; 1274 } 1275 1276 /* 1277 * Do the common updates 1278 */ 1279 /* BTH.PSN and BTH.A */ 1280 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1281 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1282 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1283 val32 |= 1UL << 31; 1284 idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, 1285 (__force u16)cpu_to_be16(val32 >> 16)); 1286 if (idx < 0) 1287 return idx; 1288 idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, 1289 (__force u16)cpu_to_be16(val32 & 0xffff)); 1290 if (idx < 0) 1291 return idx; 1292 /* KDETH.Offset */ 1293 idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, 1294 (__force u16)cpu_to_le16(req->koffset & 0xffff)); 1295 if (idx < 0) 1296 return idx; 1297 idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, 1298 (__force u16)cpu_to_le16(req->koffset >> 16)); 1299 if (idx < 0) 1300 return idx; 1301 if (req_opcode(req->info.ctrl) == EXPECTED) { 1302 __le16 val; 1303 1304 tidval = req->tids[req->tididx]; 1305 1306 /* 1307 * If the offset puts us at the end of the current TID, 1308 * advance everything. 1309 */ 1310 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1311 PAGE_SIZE)) { 1312 req->tidoffset = 0; 1313 /* 1314 * Since we don't copy all the TIDs, all at once, 1315 * we have to check again. 1316 */ 1317 if (++req->tididx > req->n_tids - 1 || 1318 !req->tids[req->tididx]) 1319 return -EINVAL; 1320 tidval = req->tids[req->tididx]; 1321 } 1322 omfactor = ((EXP_TID_GET(tidval, LEN) * 1323 PAGE_SIZE) >= 1324 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : 1325 KDETH_OM_SMALL_SHIFT; 1326 /* KDETH.OM and KDETH.OFFSET (TID) */ 1327 idx = ahg_header_set( 1328 ahg, idx, array_size, 7, 0, 16, 1329 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | 1330 ((req->tidoffset >> omfactor) 1331 & 0x7fff))); 1332 if (idx < 0) 1333 return idx; 1334 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1335 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1336 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1337 1338 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1339 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1340 INTR) << 1341 AHG_KDETH_INTR_SHIFT)); 1342 } else { 1343 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 1344 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1345 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1346 INTR) << 1347 AHG_KDETH_INTR_SHIFT)); 1348 } 1349 1350 idx = ahg_header_set(ahg, idx, array_size, 1351 7, 16, 14, (__force u16)val); 1352 if (idx < 0) 1353 return idx; 1354 } 1355 1356 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1357 req->info.comp_idx, req->sde->this_idx, 1358 req->ahg_idx, ahg, idx, tidval); 1359 sdma_txinit_ahg(&tx->txreq, 1360 SDMA_TXREQ_F_USE_AHG, 1361 datalen, req->ahg_idx, idx, 1362 ahg, sizeof(req->hdr), 1363 user_sdma_txreq_cb); 1364 1365 return idx; 1366 } 1367 1368 /* 1369 * SDMA tx request completion callback. Called when the SDMA progress 1370 * state machine gets notification that the SDMA descriptors for this 1371 * tx request have been processed by the DMA engine. Called in 1372 * interrupt context. 1373 */ 1374 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1375 { 1376 struct user_sdma_txreq *tx = 1377 container_of(txreq, struct user_sdma_txreq, txreq); 1378 struct user_sdma_request *req; 1379 struct hfi1_user_sdma_pkt_q *pq; 1380 struct hfi1_user_sdma_comp_q *cq; 1381 u16 idx; 1382 1383 if (!tx->req) 1384 return; 1385 1386 req = tx->req; 1387 pq = req->pq; 1388 cq = req->cq; 1389 1390 if (status != SDMA_TXREQ_S_OK) { 1391 SDMA_DBG(req, "SDMA completion with error %d", 1392 status); 1393 WRITE_ONCE(req->has_error, 1); 1394 } 1395 1396 req->seqcomp = tx->seqnum; 1397 kmem_cache_free(pq->txreq_cache, tx); 1398 tx = NULL; 1399 1400 idx = req->info.comp_idx; 1401 if (req->status == -1 && status == SDMA_TXREQ_S_OK) { 1402 if (req->seqcomp == req->info.npkts - 1) { 1403 req->status = 0; 1404 user_sdma_free_request(req, false); 1405 pq_update(pq); 1406 set_comp_state(pq, cq, idx, COMPLETE, 0); 1407 } 1408 } else { 1409 if (status != SDMA_TXREQ_S_OK) 1410 req->status = status; 1411 if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && 1412 (READ_ONCE(req->done) || 1413 READ_ONCE(req->has_error))) { 1414 user_sdma_free_request(req, false); 1415 pq_update(pq); 1416 set_comp_state(pq, cq, idx, ERROR, req->status); 1417 } 1418 } 1419 } 1420 1421 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1422 { 1423 if (atomic_dec_and_test(&pq->n_reqs)) { 1424 xchg(&pq->state, SDMA_PKT_Q_INACTIVE); 1425 wake_up(&pq->wait); 1426 } 1427 } 1428 1429 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1430 { 1431 int i; 1432 1433 if (!list_empty(&req->txps)) { 1434 struct sdma_txreq *t, *p; 1435 1436 list_for_each_entry_safe(t, p, &req->txps, list) { 1437 struct user_sdma_txreq *tx = 1438 container_of(t, struct user_sdma_txreq, txreq); 1439 list_del_init(&t->list); 1440 sdma_txclean(req->pq->dd, t); 1441 kmem_cache_free(req->pq->txreq_cache, tx); 1442 } 1443 } 1444 1445 for (i = 0; i < req->data_iovs; i++) { 1446 struct sdma_mmu_node *node = req->iovs[i].node; 1447 1448 if (!node) 1449 continue; 1450 1451 if (unpin) 1452 hfi1_mmu_rb_remove(req->pq->handler, 1453 &node->rb); 1454 else 1455 atomic_dec(&node->refcount); 1456 } 1457 1458 kfree(req->tids); 1459 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1460 } 1461 1462 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1463 struct hfi1_user_sdma_comp_q *cq, 1464 u16 idx, enum hfi1_sdma_comp_state state, 1465 int ret) 1466 { 1467 if (state == ERROR) 1468 cq->comps[idx].errcode = -ret; 1469 smp_wmb(); /* make sure errcode is visible first */ 1470 cq->comps[idx].status = state; 1471 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1472 idx, state, ret); 1473 } 1474 1475 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1476 unsigned long len) 1477 { 1478 return (bool)(node->addr == addr); 1479 } 1480 1481 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1482 { 1483 struct sdma_mmu_node *node = 1484 container_of(mnode, struct sdma_mmu_node, rb); 1485 1486 atomic_inc(&node->refcount); 1487 return 0; 1488 } 1489 1490 /* 1491 * Return 1 to remove the node from the rb tree and call the remove op. 1492 * 1493 * Called with the rb tree lock held. 1494 */ 1495 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1496 void *evict_arg, bool *stop) 1497 { 1498 struct sdma_mmu_node *node = 1499 container_of(mnode, struct sdma_mmu_node, rb); 1500 struct evict_data *evict_data = evict_arg; 1501 1502 /* is this node still being used? */ 1503 if (atomic_read(&node->refcount)) 1504 return 0; /* keep this node */ 1505 1506 /* this node will be evicted, add its pages to our count */ 1507 evict_data->cleared += node->npages; 1508 1509 /* have enough pages been cleared? */ 1510 if (evict_data->cleared >= evict_data->target) 1511 *stop = true; 1512 1513 return 1; /* remove this node */ 1514 } 1515 1516 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1517 { 1518 struct sdma_mmu_node *node = 1519 container_of(mnode, struct sdma_mmu_node, rb); 1520 1521 unpin_sdma_pages(node); 1522 kfree(node); 1523 } 1524 1525 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1526 { 1527 struct sdma_mmu_node *node = 1528 container_of(mnode, struct sdma_mmu_node, rb); 1529 1530 if (!atomic_read(&node->refcount)) 1531 return 1; 1532 return 0; 1533 } 1534