/*
 * Copyright(c) 2015 - 2017 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "mmu_rb.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"

static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
static unsigned initial_pkt_count = 8;

static int user_sdma_send_pkts(struct user_sdma_request *req,
			       unsigned maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	uint seq,
	bool pkts_sent)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
	 */
	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
	write_seqlock(&dev->iowait_lock);
	if (list_empty(&pq->busy.list))
		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
	write_sequnlock(&dev->iowait_lock);
	return -EBUSY;
eagain:
	return -EAGAIN;
}

static void activate_packet_queue(struct iowait *wait, int reason)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
	wake_up(&wait->wait_dma);
}

int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
				struct hfi1_filedata *fd)
{
	int ret = -ENOMEM;
	char buf[64];
	struct hfi1_devdata *dd;
	struct hfi1_user_sdma_comp_q *cq;
	struct hfi1_user_sdma_pkt_q *pq;

	if (!uctxt || !fd)
		return -EBADF;

	if (!hfi1_sdma_comp_ring_size)
		return -EINVAL;

	dd = uctxt->dd;

	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
	if (!pq)
		return -ENOMEM;

	pq->dd = dd;
	pq->ctxt = uctxt->ctxt;
	pq->subctxt = fd->subctxt;
	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
	pq->state = SDMA_PKT_Q_INACTIVE;
	atomic_set(&pq->n_reqs, 0);
	init_waitqueue_head(&pq->wait);
	atomic_set(&pq->n_locked, 0);
	pq->mm = fd->mm;

	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
		    activate_packet_queue, NULL);
	pq->reqidx = 0;

	pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
			   sizeof(*pq->reqs),
			   GFP_KERNEL);
	if (!pq->reqs)
		goto pq_reqs_nomem;

	pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size),
				 sizeof(*pq->req_in_use),
				 GFP_KERNEL);
	if (!pq->req_in_use)
		goto pq_reqs_no_in_use;

	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
		 fd->subctxt);
	pq->txreq_cache = kmem_cache_create(buf,
					    sizeof(struct user_sdma_txreq),
					    L1_CACHE_BYTES,
					    SLAB_HWCACHE_ALIGN,
					    NULL);
	if (!pq->txreq_cache) {
		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
			   uctxt->ctxt);
		goto pq_txreq_nomem;
	}

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
	if (!cq)
		goto cq_nomem;

	cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps)
				 * hfi1_sdma_comp_ring_size));
	if (!cq->comps)
		goto cq_comps_nomem;

	cq->nentries = hfi1_sdma_comp_ring_size;

	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
				   &pq->handler);
	if (ret) {
		dd_dev_err(dd, "Failed to register with MMU %d", ret);
		goto pq_mmu_fail;
	}

	fd->pq = pq;
	fd->cq = cq;

	return 0;

pq_mmu_fail:
	vfree(cq->comps);
cq_comps_nomem:
	kfree(cq);
cq_nomem:
	kmem_cache_destroy(pq->txreq_cache);
pq_txreq_nomem:
	kfree(pq->req_in_use);
pq_reqs_no_in_use:
	kfree(pq->reqs);
pq_reqs_nomem:
	kfree(pq);

	return ret;
}

int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
			       struct hfi1_ctxtdata *uctxt)
{
	struct hfi1_user_sdma_pkt_q *pq;

	trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt);

	pq = fd->pq;
	if (pq) {
		if (pq->handler)
			hfi1_mmu_rb_unregister(pq->handler);
		iowait_sdma_drain(&pq->busy);
		/* Wait until all requests have been freed. */
		wait_event_interruptible(
			pq->wait,
			(READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	u16 pkey;
	u32 slid;
	int req_queued = 0;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count. Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 */
	trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt,
					     info.comp_idx);
	req = pq->reqs + info.comp_idx;
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->data_len = 0;
	req->pq = pq;
	req->cq = cq;
	req->status = -1;
	req->ahg_idx = -1;
	req->iov_idx = 0;
	req->sent = 0;
	req->seqnum = 0;
	req->seqcomp = 0;
	req->seqsubmitted = 0;
	req->tids = NULL;
	req->done = 0;
	req->has_error = 0;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		req->data_iovs--;
	}

	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	pkey = (u16)be32_to_cpu(req->hdr.bth[0]);
	slid = be16_to_cpu(req->hdr.lrh[3]);
	if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the BTH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
			 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			  KDETH_OM_LARGE : KDETH_OM_SMALL);
	trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt,
					       info.comp_idx, req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		req->iovs[i].offset = 0;
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->data_iovs = i;
			req->status = ret;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt,
					 info.comp_idx, req->data_len);
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		req->tididx = 0;
		idx++;
	}

	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	atomic_inc(&pq->n_reqs);
	req_queued = 1;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY)) {
		req->status = ret;
		goto free_req;
	}

	/*
	 * It is possible that the SDMA engine would have processed all the
	 * submitted packets by the time we get here. Therefore, only set
	 * packet queue state to ACTIVE if there are still uncompleted
	 * requests.
	 */
	if (atomic_read(&pq->n_reqs))
		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (req->seqsubmitted != req->info.npkts) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY) {
				req->status = ret;
				WRITE_ONCE(req->has_error, 1);
				if (READ_ONCE(req->seqcomp) ==
				    req->seqsubmitted - 1)
					goto free_req;
				return ret;
			}
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	user_sdma_free_request(req, true);
	if (req_queued)
		pq_update(pq);
	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
	return ret;
}

static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one. */
		if (unlikely(!len) && ++req->tididx < req->n_tids &&
		    req->tids[req->tididx]) {
			tidlen = EXP_TID_GET(req->tids[req->tididx],
					     LEN) * PAGE_SIZE;
			req->tidoffset = 0;
			len = min_t(u32, tidlen, req->info.fragsize);
		}
		/*
		 * Since the TID pairs map entire pages, make sure that we
		 * are not going to try to send more data than we have
		 * remaining.
		 */
		len = min(len, req->data_len - req->sent);
	} else {
		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
	}
	trace_hfi1_sdma_user_compute_length(req->pq->dd,
					    req->pq->ctxt,
					    req->pq->subctxt,
					    req->info.comp_idx,
					    len);
	return len;
}

static inline u32 pad_len(u32 len)
{
	if (len & (sizeof(u32) - 1))
		len += sizeof(u32) - (len & (sizeof(u32) - 1));
	return len;
}

static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
{
	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
}

static int user_sdma_txadd_ahg(struct user_sdma_request *req,
			       struct user_sdma_txreq *tx,
			       u32 datalen)
{
	int ret;
	u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
	u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen));
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	/*
	 * Copy the request header into the tx header
	 * because the HW needs a cacheline-aligned
	 * address.
	 * This copy can be optimized out if the hdr
	 * member of user_sdma_request were also
	 * cacheline aligned.
	 */
	memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		tx->hdr.pbc[0] = cpu_to_le16(pbclen);
	}
	ret = check_header_template(req, &tx->hdr, lrhlen, datalen);
	if (ret)
		return ret;
	ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY,
			      sizeof(tx->hdr) + datalen, req->ahg_idx,
			      0, NULL, 0, user_sdma_txreq_cb);
	if (ret)
		return ret;
	ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr));
	if (ret)
		sdma_txclean(pq->dd, &tx->txreq);
	return ret;
}

static int user_sdma_txadd(struct user_sdma_request *req,
			   struct user_sdma_txreq *tx,
			   struct user_sdma_iovec *iovec, u32 datalen,
			   u32 *queued_ptr, u32 *data_sent_ptr,
			   u64 *iov_offset_ptr)
{
	int ret;
	unsigned int pageidx, len;
	unsigned long base, offset;
	u64 iov_offset = *iov_offset_ptr;
	u32 queued = *queued_ptr, data_sent = *data_sent_ptr;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	base = (unsigned long)iovec->iov.iov_base;
	offset = offset_in_page(base + iovec->offset + iov_offset);
	pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >>
		   PAGE_SHIFT);
	len = offset + req->info.fragsize > PAGE_SIZE ?
		PAGE_SIZE - offset : req->info.fragsize;
	len = min((datalen - queued), len);
	ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx],
			      offset, len);
	if (ret) {
		SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret);
		return ret;
	}
	iov_offset += len;
	queued += len;
	data_sent += len;
	if (unlikely(queued < datalen && pageidx == iovec->npages &&
		     req->iov_idx < req->data_iovs - 1)) {
		iovec->offset += iov_offset;
		iovec = &req->iovs[++req->iov_idx];
		iov_offset = 0;
	}

	*queued_ptr = queued;
	*data_sent_ptr = data_sent;
	*iov_offset_ptr = iov_offset;
	return ret;
}

static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
{
	int ret = 0, count;
	unsigned npkts = 0;
	struct user_sdma_txreq *tx = NULL;
	struct hfi1_user_sdma_pkt_q *pq = NULL;
	struct user_sdma_iovec *iovec = NULL;

	if (!req->pq)
		return -EINVAL;

	pq = req->pq;

	/* If tx completion has reported an error, we are done. */
	if (READ_ONCE(req->has_error))
		return -EFAULT;

	/*
	 * Check if we might have sent the entire request already
	 */
	if (unlikely(req->seqnum == req->info.npkts)) {
		if (!list_empty(&req->txps))
			goto dosend;
		return ret;
	}

	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
		maxpkts = req->info.npkts - req->seqnum;

	while (npkts < maxpkts) {
		u32 datalen = 0, queued = 0, data_sent = 0;
		u64 iov_offset = 0;

		/*
		 * Check whether any of the completions have come back
		 * with errors. If so, we are not going to process any
		 * more packets from this request.
		 */
		if (READ_ONCE(req->has_error))
			return -EFAULT;

		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
		if (!tx)
			return -ENOMEM;

		tx->flags = 0;
		tx->req = req;
		tx->busycount = 0;
		INIT_LIST_HEAD(&tx->list);

		/*
		 * For the last packet set the ACK request
		 * and disable header suppression.
		 */
		if (req->seqnum == req->info.npkts - 1)
			tx->flags |= (TXREQ_FLAGS_REQ_ACK |
				      TXREQ_FLAGS_REQ_DISABLE_SH);

		/*
		 * Calculate the payload size - this is min of the fragment
		 * (MTU) size or the remaining bytes in the request but only
		 * if we have payload data.
		 */
		if (req->data_len) {
			iovec = &req->iovs[req->iov_idx];
			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
				if (++req->iov_idx == req->data_iovs) {
					ret = -EFAULT;
					goto free_txreq;
				}
				iovec = &req->iovs[req->iov_idx];
				WARN_ON(iovec->offset);
			}

			datalen = compute_data_length(req, tx);

			/*
			 * Disable header suppression for payloads <= 8DWS.
			 * If there is an uncorrectable error in the receive
			 * data FIFO when the received payload size is less
			 * than or equal to 8DWS, then RxDmaDataFifoRdUncErr
			 * is not reported; RHF.EccErr is set instead if the
			 * header is not suppressed.
			 */
			if (!datalen) {
				SDMA_DBG(req,
					 "Request has data but pkt len is 0");
				ret = -EFAULT;
				goto free_tx;
			} else if (datalen <= 32) {
				tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH;
			}
		}

		if (req->ahg_idx >= 0) {
			if (!req->seqnum) {
				ret = user_sdma_txadd_ahg(req, tx, datalen);
				if (ret)
					goto free_tx;
			} else {
				int changes;

				changes = set_txreq_header_ahg(req, tx,
							       datalen);
				if (changes < 0)
					goto free_tx;
			}
		} else {
			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
					  datalen, user_sdma_txreq_cb);
			if (ret)
				goto free_tx;
			/*
			 * Modify the header for this packet. This only needs
			 * to be done if we are not going to use AHG. Otherwise,
			 * the HW will do it based on the changes we gave it
			 * during sdma_txinit_ahg().
			 */
			ret = set_txreq_header(req, tx, datalen);
			if (ret)
				goto free_txreq;
		}

		/*
		 * If the request contains any data vectors, add up to
		 * fragsize bytes to the descriptor.
		 */
		while (queued < datalen &&
		       (req->sent + data_sent) < req->data_len) {
			ret = user_sdma_txadd(req, tx, iovec, datalen,
					      &queued, &data_sent, &iov_offset);
			if (ret)
				goto free_txreq;
		}
		/*
		 * The txreq was submitted successfully so we can update
		 * the counters.
		 */
		req->koffset += datalen;
		if (req_opcode(req->info.ctrl) == EXPECTED)
			req->tidoffset += datalen;
		req->sent += data_sent;
		if (req->data_len)
			iovec->offset += iov_offset;
		list_add_tail(&tx->txreq.list, &req->txps);
		/*
		 * It is important to increment this here as it is used to
		 * generate the BTH.PSN and, therefore, can't be bulk-updated
		 * outside of the loop.
		 */
		tx->seqnum = req->seqnum++;
		npkts++;
	}
dosend:
	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
	req->seqsubmitted += count;
	if (req->seqsubmitted == req->info.npkts) {
		WRITE_ONCE(req->done, 1);
		/*
		 * The txreq has already been submitted to the HW queue
		 * so we can free the AHG entry now. Corruption will not
		 * happen due to the sequential manner in which
		 * descriptors are processed.
		 */
		if (req->ahg_idx >= 0)
			sdma_ahg_free(req->sde, req->ahg_idx);
	}
	return ret;

free_txreq:
	sdma_txclean(pq->dd, &tx->txreq);
free_tx:
	kmem_cache_free(pq->txreq_cache, tx);
	return ret;
}

static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
{
	struct evict_data evict_data;

	evict_data.cleared = 0;
	evict_data.target = npages;
	hfi1_mmu_rb_evict(pq->handler, &evict_data);
	return evict_data.cleared;
}

static int pin_sdma_pages(struct user_sdma_request *req,
			  struct user_sdma_iovec *iovec,
			  struct sdma_mmu_node *node,
			  int npages)
{
	int pinned, cleared;
	struct page **pages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;

	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		SDMA_DBG(req, "Failed page array alloc");
		return -ENOMEM;
	}
	memcpy(pages, node->pages, node->npages * sizeof(*pages));

	npages -= node->npages;
retry:
	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
				atomic_read(&pq->n_locked), npages)) {
		cleared = sdma_cache_evict(pq, npages);
		if (cleared >= npages)
			goto retry;
	}
	pinned = hfi1_acquire_user_pages(pq->mm,
					 ((unsigned long)iovec->iov.iov_base +
					 (node->npages * PAGE_SIZE)), npages, 0,
					 pages + node->npages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;
	}
	if (pinned != npages) {
		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
		return -EFAULT;
	}
	kfree(node->pages);
	node->rb.len = iovec->iov.iov_len;
	node->pages = pages;
	atomic_add(pinned, &pq->n_locked);
	return pinned;
}

static void unpin_sdma_pages(struct sdma_mmu_node *node)
{
	if (node->npages) {
		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
		atomic_sub(node->npages, &node->pq->n_locked);
	}
}

static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec)
{
	int ret = 0, pinned, npages;
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct sdma_mmu_node *node = NULL;
	struct mmu_rb_node *rb_node;
	struct iovec *iov;
	bool extracted;

	extracted =
		hfi1_mmu_rb_remove_unless_exact(pq->handler,
						(unsigned long)
						iovec->iov.iov_base,
						iovec->iov.iov_len, &rb_node);
	if (rb_node) {
		node = container_of(rb_node, struct sdma_mmu_node, rb);
		if (!extracted) {
			atomic_inc(&node->refcount);
			iovec->pages = node->pages;
			iovec->npages = node->npages;
			iovec->node = node;
			return 0;
		}
	}

	if (!node) {
		node = kzalloc(sizeof(*node), GFP_KERNEL);
		if (!node)
			return -ENOMEM;

		node->rb.addr = (unsigned long)iovec->iov.iov_base;
		node->pq = pq;
		atomic_set(&node->refcount, 0);
	}

	iov = &iovec->iov;
	npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len);
	if (node->npages < npages) {
		pinned = pin_sdma_pages(req, iovec, node, npages);
		if (pinned < 0) {
			ret = pinned;
			goto bail;
		}
		node->npages += pinned;
		npages = node->npages;
	}
	iovec->pages = node->pages;
	iovec->npages = npages;
	iovec->node = node;

	ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb);
	if (ret) {
		iovec->node = NULL;
		goto bail;
	}
	return 0;
bail:
	unpin_sdma_pages(node);
	kfree(node);
	return ret;
}

static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	hfi1_release_user_pages(mm, pages + start, npages, false);
	kfree(pages);
}
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen)
{
	/*
	 * Perform safety checks for any type of packet:
	 * - transfer size is multiple of 64 bytes
	 * - packet length is multiple of 4 bytes
	 * - packet length is not larger than MTU size
	 *
	 * These checks are only done for the first packet of the
	 * transfer since the header is "given" to us by user space.
	 * For the remainder of the packets we compute the values.
	 */
	if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 ||
	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
		return -EINVAL;

	if (req_opcode(req->info.ctrl) == EXPECTED) {
		/*
		 * The header is checked only on the first packet. Furthermore,
		 * we ensure that at least one TID entry is copied when the
		 * request is submitted. Therefore, we don't have to verify that
		 * tididx points to something sane.
		 */
		u32 tidval = req->tids[req->tididx],
			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
			tididx = EXP_TID_GET(tidval, IDX),
			tidctrl = EXP_TID_GET(tidval, CTRL),
			tidoff;
		__le32 kval = hdr->kdeth.ver_tid_offset;

		tidoff = KDETH_GET(kval, OFFSET) *
			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
			   KDETH_OM_LARGE : KDETH_OM_SMALL);
		/*
		 * Expected receive packets have the following
		 * additional checks:
		 * - offset is not larger than the TID size
		 * - TIDCtrl values match between header and TID array
		 * - TID indexes match between header and TID array
		 */
		if ((tidoff + datalen > tidlen) ||
		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
		    KDETH_GET(kval, TID) != tididx)
			return -EINVAL;
	}
	return 0;
}

/*
 * Correctly set the BTH.PSN field based on type of
 * transfer - eager packets can just increment the PSN but
 * expected packets encode generation and sequence in the
 * BTH.PSN field so just incrementing will result in errors.
 */
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
{
	u32 val = be32_to_cpu(bthpsn),
		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
			0xffffffull),
		psn = val & mask;
	if (expct)
		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
	else
		psn = psn + frags;
	return psn & mask;
}

static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen)
{
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &tx->hdr;
	u8 omfactor; /* KDETH.OM */
	u16 pbclen;
	int ret;
	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	/* Copy the header template to the request before modification */
	memcpy(hdr, &req->hdr, sizeof(*hdr));

	/*
	 * Check if the PBC and LRH length are mismatched. If so
	 * adjust both in the header.
	 */
	pbclen = le16_to_cpu(hdr->pbc[0]);
	if (PBC2LRH(pbclen) != lrhlen) {
		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
		hdr->pbc[0] = cpu_to_le16(pbclen);
		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
		/*
		 * Third packet
		 * This is the first packet in the sequence that has
		 * a "static" size that can be used for the rest of
		 * the packets (besides the last one).
		 */
		if (unlikely(req->seqnum == 2)) {
			/*
			 * From this point on the lengths in both the
			 * PBC and LRH are the same until the last
			 * packet.
			 * Adjust the template so we don't have to update
			 * every packet
			 */
			req->hdr.pbc[0] = hdr->pbc[0];
			req->hdr.lrh[2] = hdr->lrh[2];
		}
	}
	/*
	 * We only have to modify the header if this is not the
	 * first packet in the request. Otherwise, we use the
	 * header given to us.
	 */
	if (unlikely(!req->seqnum)) {
		ret = check_header_template(req, hdr, lrhlen, datalen);
		if (ret)
			return ret;
		goto done;
	}

	hdr->bth[2] = cpu_to_be32(
		set_pkt_bth_psn(hdr->bth[2],
				(req_opcode(req->info.ctrl) == EXPECTED),
				req->seqnum));

	/* Set ACK request on last packet */
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		hdr->bth[2] |= cpu_to_be32(1UL << 31);

	/* Set the new offset */
	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
	/* Expected packets have to fill in the new TID information */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		tidval = req->tids[req->tididx];
		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx]) {
				return -EINVAL;
			}
			tidval = req->tids[req->tididx];
		}
		omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT :
			KDETH_OM_SMALL_SHIFT;
		/* Set KDETH.TIDCtrl based on value for this TID. */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
			  EXP_TID_GET(tidval, CTRL));
		/* Set KDETH.TID based on value for this TID */
		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
			  EXP_TID_GET(tidval, IDX));
		/* Clear KDETH.SH when DISABLE_SH flag is set */
		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH))
			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
		/*
		 * Set the KDETH.OFFSET and KDETH.OM based on size of
		 * transfer.
		 */
		trace_hfi1_sdma_user_tid_info(
			pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx,
			req->tidoffset, req->tidoffset >> omfactor,
			omfactor != KDETH_OM_SMALL_SHIFT);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
			  req->tidoffset >> omfactor);
		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
			  omfactor != KDETH_OM_SMALL_SHIFT);
	}
done:
	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
				    req->info.comp_idx, hdr, tidval);
	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
}

static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 datalen)
{
	u32 ahg[AHG_KDETH_ARRAY_SIZE];
	int diff = 0;
	u8 omfactor; /* KDETH.OM */
	struct hfi1_user_sdma_pkt_q *pq = req->pq;
	struct hfi1_pkt_header *hdr = &req->hdr;
	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen));

	if (PBC2LRH(pbclen) != lrhlen) {
		/* PBC.PbcLengthDWs */
		AHG_HEADER_SET(ahg, diff, 0, 0, 12,
			       cpu_to_le16(LRH2PBC(lrhlen)));
		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
		AHG_HEADER_SET(ahg, diff, 3, 0, 16,
			       cpu_to_be16(lrhlen >> 2));
	}

	/*
	 * Do the common updates
	 */
	/* BTH.PSN and BTH.A */
	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK))
		val32 |= 1UL << 31;
	AHG_HEADER_SET(ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
	AHG_HEADER_SET(ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
	/* KDETH.Offset */
	AHG_HEADER_SET(ahg, diff, 15, 0, 16,
		       cpu_to_le16(req->koffset & 0xffff));
	AHG_HEADER_SET(ahg, diff, 15, 16, 16, cpu_to_le16(req->koffset >> 16));
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		__le16 val;

		tidval = req->tids[req->tididx];

		/*
		 * If the offset puts us at the end of the current TID,
		 * advance everything.
		 */
		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
					 PAGE_SIZE)) {
			req->tidoffset = 0;
			/*
			 * Since we don't copy all the TIDs all at once,
			 * we have to check again.
			 */
			if (++req->tididx > req->n_tids - 1 ||
			    !req->tids[req->tididx])
				return -EINVAL;
			tidval = req->tids[req->tididx];
		}
		omfactor = ((EXP_TID_GET(tidval, LEN) *
			     PAGE_SIZE) >=
			    KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT :
			    KDETH_OM_SMALL_SHIFT;
		/* KDETH.OM and KDETH.OFFSET (TID) */
		AHG_HEADER_SET(ahg, diff, 7, 0, 16,
			       ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 |
				((req->tidoffset >> omfactor)
				 & 0x7fff)));
		/* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */
		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
				  (EXP_TID_GET(tidval, IDX) & 0x3ff));

		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) {
			val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		} else {
			val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ?
			       cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) :
			       cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset,
						      INTR) <<
					    AHG_KDETH_INTR_SHIFT));
		}

		AHG_HEADER_SET(ahg, diff, 7, 16, 14, val);
	}
	if (diff < 0)
		return diff;

	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
					req->info.comp_idx, req->sde->this_idx,
					req->ahg_idx, ahg, diff, tidval);
	sdma_txinit_ahg(&tx->txreq,
			SDMA_TXREQ_F_USE_AHG,
			datalen, req->ahg_idx, diff,
			ahg, sizeof(req->hdr),
			user_sdma_txreq_cb);

	return diff;
}

/*
 * SDMA tx request completion callback. Called when the SDMA progress
 * state machine gets notification that the SDMA descriptors for this
 * tx request have been processed by the DMA engine. Called in
 * interrupt context.
 */
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
{
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);
	struct user_sdma_request *req;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	u16 idx;

	if (!tx->req)
		return;

	req = tx->req;
	pq = req->pq;
	cq = req->cq;

	if (status != SDMA_TXREQ_S_OK) {
		SDMA_DBG(req, "SDMA completion with error %d",
			 status);
		WRITE_ONCE(req->has_error, 1);
	}

	req->seqcomp = tx->seqnum;
	kmem_cache_free(pq->txreq_cache, tx);
	tx = NULL;

	idx = req->info.comp_idx;
	if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
		if (req->seqcomp == req->info.npkts - 1) {
			req->status = 0;
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, COMPLETE, 0);
		}
	} else {
		if (status != SDMA_TXREQ_S_OK)
			req->status = status;
		if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) &&
		    (READ_ONCE(req->done) ||
		     READ_ONCE(req->has_error))) {
			user_sdma_free_request(req, false);
			pq_update(pq);
			set_comp_state(pq, cq, idx, ERROR, req->status);
		}
	}
}

static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
{
	if (atomic_dec_and_test(&pq->n_reqs)) {
		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
		wake_up(&pq->wait);
	}
}

static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
{
	if (!list_empty(&req->txps)) {
		struct sdma_txreq *t, *p;

		list_for_each_entry_safe(t, p, &req->txps, list) {
			struct user_sdma_txreq *tx =
				container_of(t, struct user_sdma_txreq, txreq);
			list_del_init(&t->list);
			sdma_txclean(req->pq->dd, t);
			kmem_cache_free(req->pq->txreq_cache, tx);
		}
	}
	if (req->data_iovs) {
		struct sdma_mmu_node *node;
		int i;

		for (i = 0; i < req->data_iovs; i++) {
			node = req->iovs[i].node;
			if (!node)
				continue;

			if (unpin)
				hfi1_mmu_rb_remove(req->pq->handler,
						   &node->rb);
			else
				atomic_dec(&node->refcount);
		}
	}
	kfree(req->tids);
	clear_bit(req->info.comp_idx, req->pq->req_in_use);
}

static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret)
{
	if (state == ERROR)
		cq->comps[idx].errcode = -ret;
	smp_wmb(); /* make sure errcode is visible first */
	cq->comps[idx].status = state;
	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
					idx, state, ret);
}
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len)
{
	return (bool)(node->addr == addr);
}

static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	atomic_inc(&node->refcount);
	return 0;
}

/*
 * Return 1 to remove the node from the rb tree and call the remove op.
 *
 * Called with the rb tree lock held.
 */
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *evict_arg, bool *stop)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);
	struct evict_data *evict_data = evict_arg;

	/* is this node still being used? */
	if (atomic_read(&node->refcount))
		return 0; /* keep this node */

	/* this node will be evicted, add its pages to our count */
	evict_data->cleared += node->npages;

	/* have enough pages been cleared? */
	if (evict_data->cleared >= evict_data->target)
		*stop = true;

	return 1; /* remove this node */
}

static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	unpin_sdma_pages(node);
	kfree(node);
}

static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
	struct sdma_mmu_node *node =
		container_of(mnode, struct sdma_mmu_node, rb);

	if (!atomic_read(&node->refcount))
		return 1;
	return 0;
}