1 /* 2 * Copyright(c) 2015 - 2018 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 #include <linux/mm.h> 48 #include <linux/types.h> 49 #include <linux/device.h> 50 #include <linux/dmapool.h> 51 #include <linux/slab.h> 52 #include <linux/list.h> 53 #include <linux/highmem.h> 54 #include <linux/io.h> 55 #include <linux/uio.h> 56 #include <linux/rbtree.h> 57 #include <linux/spinlock.h> 58 #include <linux/delay.h> 59 #include <linux/kthread.h> 60 #include <linux/mmu_context.h> 61 #include <linux/module.h> 62 #include <linux/vmalloc.h> 63 #include <linux/string.h> 64 65 #include "hfi.h" 66 #include "sdma.h" 67 #include "mmu_rb.h" 68 #include "user_sdma.h" 69 #include "verbs.h" /* for the headers */ 70 #include "common.h" /* for struct hfi1_tid_info */ 71 #include "trace.h" 72 73 static uint hfi1_sdma_comp_ring_size = 128; 74 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 75 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 76 77 static unsigned initial_pkt_count = 8; 78 79 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); 80 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); 81 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); 82 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); 83 static int pin_vector_pages(struct user_sdma_request *req, 84 struct user_sdma_iovec *iovec); 85 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 86 unsigned start, unsigned npages); 87 static int check_header_template(struct user_sdma_request *req, 88 struct hfi1_pkt_header *hdr, u32 lrhlen, 89 u32 datalen); 90 static int set_txreq_header(struct user_sdma_request *req, 91 struct user_sdma_txreq *tx, u32 datalen); 92 static int set_txreq_header_ahg(struct user_sdma_request *req, 93 struct user_sdma_txreq *tx, u32 len); 94 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 95 struct hfi1_user_sdma_comp_q *cq, 96 u16 idx, enum hfi1_sdma_comp_state state, 97 int ret); 98 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags); 99 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 100 101 static int defer_packet_queue( 102 struct sdma_engine *sde, 103 struct iowait_work *wait, 104 struct sdma_txreq *txreq, 105 uint seq, 106 bool pkts_sent); 107 static void activate_packet_queue(struct iowait *wait, int reason); 108 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 109 unsigned long len); 110 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode); 111 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 112 void *arg2, bool *stop); 113 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); 114 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode); 115 116 static struct mmu_rb_ops sdma_rb_ops = { 117 .filter = sdma_rb_filter, 118 .insert = sdma_rb_insert, 119 .evict = sdma_rb_evict, 120 .remove = sdma_rb_remove, 121 .invalidate = sdma_rb_invalidate 122 }; 123 124 static int defer_packet_queue( 125 struct sdma_engine *sde, 126 struct iowait_work *wait, 127 struct sdma_txreq *txreq, 128 uint seq, 129 bool pkts_sent) 130 { 131 struct hfi1_user_sdma_pkt_q *pq = 132 container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); 133 134 write_seqlock(&sde->waitlock); 135 if (sdma_progress(sde, seq, txreq)) 136 goto eagain; 137 /* 138 * We are assuming that if the list is enqueued somewhere, it 139 * is to the dmawait list since that is the only place where 140 * it is supposed to be enqueued. 141 */ 142 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 143 if (list_empty(&pq->busy.list)) { 144 pq->busy.lock = &sde->waitlock; 145 iowait_get_priority(&pq->busy); 146 iowait_queue(pkts_sent, &pq->busy, &sde->dmawait); 147 } 148 write_sequnlock(&sde->waitlock); 149 return -EBUSY; 150 eagain: 151 write_sequnlock(&sde->waitlock); 152 return -EAGAIN; 153 } 154 155 static void activate_packet_queue(struct iowait *wait, int reason) 156 { 157 struct hfi1_user_sdma_pkt_q *pq = 158 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 159 pq->busy.lock = NULL; 160 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 161 wake_up(&wait->wait_dma); 162 }; 163 164 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, 165 struct hfi1_filedata *fd) 166 { 167 int ret = -ENOMEM; 168 char buf[64]; 169 struct hfi1_devdata *dd; 170 struct hfi1_user_sdma_comp_q *cq; 171 struct hfi1_user_sdma_pkt_q *pq; 172 173 if (!uctxt || !fd) 174 return -EBADF; 175 176 if (!hfi1_sdma_comp_ring_size) 177 return -EINVAL; 178 179 dd = uctxt->dd; 180 181 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 182 if (!pq) 183 return -ENOMEM; 184 pq->dd = dd; 185 pq->ctxt = uctxt->ctxt; 186 pq->subctxt = fd->subctxt; 187 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 188 atomic_set(&pq->n_reqs, 0); 189 init_waitqueue_head(&pq->wait); 190 atomic_set(&pq->n_locked, 0); 191 pq->mm = fd->mm; 192 193 iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, 194 activate_packet_queue, NULL, NULL); 195 pq->reqidx = 0; 196 197 pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, 198 sizeof(*pq->reqs), 199 GFP_KERNEL); 200 if (!pq->reqs) 201 goto pq_reqs_nomem; 202 203 pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size), 204 sizeof(*pq->req_in_use), 205 GFP_KERNEL); 206 if (!pq->req_in_use) 207 goto pq_reqs_no_in_use; 208 209 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 210 fd->subctxt); 211 pq->txreq_cache = kmem_cache_create(buf, 212 sizeof(struct user_sdma_txreq), 213 L1_CACHE_BYTES, 214 SLAB_HWCACHE_ALIGN, 215 NULL); 216 if (!pq->txreq_cache) { 217 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 218 uctxt->ctxt); 219 goto pq_txreq_nomem; 220 } 221 222 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 223 if (!cq) 224 goto cq_nomem; 225 226 cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) 227 * hfi1_sdma_comp_ring_size)); 228 if (!cq->comps) 229 goto cq_comps_nomem; 230 231 cq->nentries = hfi1_sdma_comp_ring_size; 232 233 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 234 &pq->handler); 235 if (ret) { 236 dd_dev_err(dd, "Failed to register with MMU %d", ret); 237 goto pq_mmu_fail; 238 } 239 240 rcu_assign_pointer(fd->pq, pq); 241 fd->cq = cq; 242 243 return 0; 244 245 pq_mmu_fail: 246 vfree(cq->comps); 247 cq_comps_nomem: 248 kfree(cq); 249 cq_nomem: 250 kmem_cache_destroy(pq->txreq_cache); 251 pq_txreq_nomem: 252 kfree(pq->req_in_use); 253 pq_reqs_no_in_use: 254 kfree(pq->reqs); 255 pq_reqs_nomem: 256 kfree(pq); 257 258 return ret; 259 } 260 261 static void flush_pq_iowait(struct hfi1_user_sdma_pkt_q *pq) 262 { 263 unsigned long flags; 264 seqlock_t *lock = pq->busy.lock; 265 266 if (!lock) 267 return; 268 write_seqlock_irqsave(lock, flags); 269 if (!list_empty(&pq->busy.list)) { 270 list_del_init(&pq->busy.list); 271 pq->busy.lock = NULL; 272 } 273 write_sequnlock_irqrestore(lock, flags); 274 } 275 276 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, 277 struct hfi1_ctxtdata *uctxt) 278 { 279 struct hfi1_user_sdma_pkt_q *pq; 280 281 trace_hfi1_sdma_user_free_queues(uctxt->dd, uctxt->ctxt, fd->subctxt); 282 283 spin_lock(&fd->pq_rcu_lock); 284 pq = srcu_dereference_check(fd->pq, &fd->pq_srcu, 285 lockdep_is_held(&fd->pq_rcu_lock)); 286 if (pq) { 287 rcu_assign_pointer(fd->pq, NULL); 288 spin_unlock(&fd->pq_rcu_lock); 289 synchronize_srcu(&fd->pq_srcu); 290 /* at this point there can be no more new requests */ 291 if (pq->handler) 292 hfi1_mmu_rb_unregister(pq->handler); 293 iowait_sdma_drain(&pq->busy); 294 /* Wait until all requests have been freed. */ 295 wait_event_interruptible( 296 pq->wait, 297 !atomic_read(&pq->n_reqs)); 298 kfree(pq->reqs); 299 kfree(pq->req_in_use); 300 kmem_cache_destroy(pq->txreq_cache); 301 flush_pq_iowait(pq); 302 kfree(pq); 303 } else { 304 spin_unlock(&fd->pq_rcu_lock); 305 } 306 if (fd->cq) { 307 vfree(fd->cq->comps); 308 kfree(fd->cq); 309 fd->cq = NULL; 310 } 311 return 0; 312 } 313 314 static u8 dlid_to_selector(u16 dlid) 315 { 316 static u8 mapping[256]; 317 static int initialized; 318 static u8 next; 319 int hash; 320 321 if (!initialized) { 322 memset(mapping, 0xFF, 256); 323 initialized = 1; 324 } 325 326 hash = ((dlid >> 8) ^ dlid) & 0xFF; 327 if (mapping[hash] == 0xFF) { 328 mapping[hash] = next; 329 next = (next + 1) & 0x7F; 330 } 331 332 return mapping[hash]; 333 } 334 335 /** 336 * hfi1_user_sdma_process_request() - Process and start a user sdma request 337 * @fd: valid file descriptor 338 * @iovec: array of io vectors to process 339 * @dim: overall iovec array size 340 * @count: number of io vector array entries processed 341 */ 342 int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, 343 struct iovec *iovec, unsigned long dim, 344 unsigned long *count) 345 { 346 int ret = 0, i; 347 struct hfi1_ctxtdata *uctxt = fd->uctxt; 348 struct hfi1_user_sdma_pkt_q *pq = 349 srcu_dereference(fd->pq, &fd->pq_srcu); 350 struct hfi1_user_sdma_comp_q *cq = fd->cq; 351 struct hfi1_devdata *dd = pq->dd; 352 unsigned long idx = 0; 353 u8 pcount = initial_pkt_count; 354 struct sdma_req_info info; 355 struct user_sdma_request *req; 356 u8 opcode, sc, vl; 357 u16 pkey; 358 u32 slid; 359 u16 dlid; 360 u32 selector; 361 362 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 363 hfi1_cdbg( 364 SDMA, 365 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 366 dd->unit, uctxt->ctxt, fd->subctxt, 367 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 368 return -EINVAL; 369 } 370 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 371 if (ret) { 372 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 373 dd->unit, uctxt->ctxt, fd->subctxt, ret); 374 return -EFAULT; 375 } 376 377 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 378 (u16 *)&info); 379 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 380 hfi1_cdbg(SDMA, 381 "[%u:%u:%u:%u] Invalid comp index", 382 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 383 return -EINVAL; 384 } 385 386 /* 387 * Sanity check the header io vector count. Need at least 1 vector 388 * (header) and cannot be larger than the actual io vector count. 389 */ 390 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 391 hfi1_cdbg(SDMA, 392 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 393 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 394 req_iovcnt(info.ctrl), dim); 395 return -EINVAL; 396 } 397 398 if (!info.fragsize) { 399 hfi1_cdbg(SDMA, 400 "[%u:%u:%u:%u] Request does not specify fragsize", 401 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 402 return -EINVAL; 403 } 404 405 /* Try to claim the request. */ 406 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 407 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 408 dd->unit, uctxt->ctxt, fd->subctxt, 409 info.comp_idx); 410 return -EBADSLT; 411 } 412 /* 413 * All safety checks have been done and this request has been claimed. 414 */ 415 trace_hfi1_sdma_user_process_request(dd, uctxt->ctxt, fd->subctxt, 416 info.comp_idx); 417 req = pq->reqs + info.comp_idx; 418 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 419 req->data_len = 0; 420 req->pq = pq; 421 req->cq = cq; 422 req->ahg_idx = -1; 423 req->iov_idx = 0; 424 req->sent = 0; 425 req->seqnum = 0; 426 req->seqcomp = 0; 427 req->seqsubmitted = 0; 428 req->tids = NULL; 429 req->has_error = 0; 430 INIT_LIST_HEAD(&req->txps); 431 432 memcpy(&req->info, &info, sizeof(info)); 433 434 /* The request is initialized, count it */ 435 atomic_inc(&pq->n_reqs); 436 437 if (req_opcode(info.ctrl) == EXPECTED) { 438 /* expected must have a TID info and at least one data vector */ 439 if (req->data_iovs < 2) { 440 SDMA_DBG(req, 441 "Not enough vectors for expected request"); 442 ret = -EINVAL; 443 goto free_req; 444 } 445 req->data_iovs--; 446 } 447 448 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 449 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 450 MAX_VECTORS_PER_REQ); 451 ret = -EINVAL; 452 goto free_req; 453 } 454 /* Copy the header from the user buffer */ 455 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 456 sizeof(req->hdr)); 457 if (ret) { 458 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 459 ret = -EFAULT; 460 goto free_req; 461 } 462 463 /* If Static rate control is not enabled, sanitize the header. */ 464 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 465 req->hdr.pbc[2] = 0; 466 467 /* Validate the opcode. Do not trust packets from user space blindly. */ 468 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 469 if ((opcode & USER_OPCODE_CHECK_MASK) != 470 USER_OPCODE_CHECK_VAL) { 471 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 472 ret = -EINVAL; 473 goto free_req; 474 } 475 /* 476 * Validate the vl. Do not trust packets from user space blindly. 477 * VL comes from PBC, SC comes from LRH, and the VL needs to 478 * match the SC look up. 479 */ 480 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 481 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 482 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 483 if (vl >= dd->pport->vls_operational || 484 vl != sc_to_vlt(dd, sc)) { 485 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 486 ret = -EINVAL; 487 goto free_req; 488 } 489 490 /* Checking P_KEY for requests from user-space */ 491 pkey = (u16)be32_to_cpu(req->hdr.bth[0]); 492 slid = be16_to_cpu(req->hdr.lrh[3]); 493 if (egress_pkey_check(dd->pport, slid, pkey, sc, PKEY_CHECK_INVALID)) { 494 ret = -EINVAL; 495 goto free_req; 496 } 497 498 /* 499 * Also should check the BTH.lnh. If it says the next header is GRH then 500 * the RXE parsing will be off and will land in the middle of the KDETH 501 * or miss it entirely. 502 */ 503 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 504 SDMA_DBG(req, "User tried to pass in a GRH"); 505 ret = -EINVAL; 506 goto free_req; 507 } 508 509 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 510 /* 511 * Calculate the initial TID offset based on the values of 512 * KDETH.OFFSET and KDETH.OM that are passed in. 513 */ 514 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 515 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 516 KDETH_OM_LARGE : KDETH_OM_SMALL); 517 trace_hfi1_sdma_user_initial_tidoffset(dd, uctxt->ctxt, fd->subctxt, 518 info.comp_idx, req->tidoffset); 519 idx++; 520 521 /* Save all the IO vector structures */ 522 for (i = 0; i < req->data_iovs; i++) { 523 req->iovs[i].offset = 0; 524 INIT_LIST_HEAD(&req->iovs[i].list); 525 memcpy(&req->iovs[i].iov, 526 iovec + idx++, 527 sizeof(req->iovs[i].iov)); 528 ret = pin_vector_pages(req, &req->iovs[i]); 529 if (ret) { 530 req->data_iovs = i; 531 goto free_req; 532 } 533 req->data_len += req->iovs[i].iov.iov_len; 534 } 535 trace_hfi1_sdma_user_data_length(dd, uctxt->ctxt, fd->subctxt, 536 info.comp_idx, req->data_len); 537 if (pcount > req->info.npkts) 538 pcount = req->info.npkts; 539 /* 540 * Copy any TID info 541 * User space will provide the TID info only when the 542 * request type is EXPECTED. This is true even if there is 543 * only one packet in the request and the header is already 544 * setup. The reason for the singular TID case is that the 545 * driver needs to perform safety checks. 546 */ 547 if (req_opcode(req->info.ctrl) == EXPECTED) { 548 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 549 u32 *tmp; 550 551 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 552 ret = -EINVAL; 553 goto free_req; 554 } 555 556 /* 557 * We have to copy all of the tids because they may vary 558 * in size and, therefore, the TID count might not be 559 * equal to the pkt count. However, there is no way to 560 * tell at this point. 561 */ 562 tmp = memdup_user(iovec[idx].iov_base, 563 ntids * sizeof(*req->tids)); 564 if (IS_ERR(tmp)) { 565 ret = PTR_ERR(tmp); 566 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 567 ntids, ret); 568 goto free_req; 569 } 570 req->tids = tmp; 571 req->n_tids = ntids; 572 req->tididx = 0; 573 idx++; 574 } 575 576 dlid = be16_to_cpu(req->hdr.lrh[1]); 577 selector = dlid_to_selector(dlid); 578 selector += uctxt->ctxt + fd->subctxt; 579 req->sde = sdma_select_user_engine(dd, selector, vl); 580 581 if (!req->sde || !sdma_running(req->sde)) { 582 ret = -ECOMM; 583 goto free_req; 584 } 585 586 /* We don't need an AHG entry if the request contains only one packet */ 587 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) 588 req->ahg_idx = sdma_ahg_alloc(req->sde); 589 590 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 591 pq->state = SDMA_PKT_Q_ACTIVE; 592 593 /* 594 * This is a somewhat blocking send implementation. 595 * The driver will block the caller until all packets of the 596 * request have been submitted to the SDMA engine. However, it 597 * will not wait for send completions. 598 */ 599 while (req->seqsubmitted != req->info.npkts) { 600 ret = user_sdma_send_pkts(req, pcount); 601 if (ret < 0) { 602 if (ret != -EBUSY) 603 goto free_req; 604 if (wait_event_interruptible_timeout( 605 pq->busy.wait_dma, 606 pq->state == SDMA_PKT_Q_ACTIVE, 607 msecs_to_jiffies( 608 SDMA_IOWAIT_TIMEOUT)) <= 0) 609 flush_pq_iowait(pq); 610 } 611 } 612 *count += idx; 613 return 0; 614 free_req: 615 /* 616 * If the submitted seqsubmitted == npkts, the completion routine 617 * controls the final state. If sequbmitted < npkts, wait for any 618 * outstanding packets to finish before cleaning up. 619 */ 620 if (req->seqsubmitted < req->info.npkts) { 621 if (req->seqsubmitted) 622 wait_event(pq->busy.wait_dma, 623 (req->seqcomp == req->seqsubmitted - 1)); 624 user_sdma_free_request(req, true); 625 pq_update(pq); 626 set_comp_state(pq, cq, info.comp_idx, ERROR, ret); 627 } 628 return ret; 629 } 630 631 static inline u32 compute_data_length(struct user_sdma_request *req, 632 struct user_sdma_txreq *tx) 633 { 634 /* 635 * Determine the proper size of the packet data. 636 * The size of the data of the first packet is in the header 637 * template. However, it includes the header and ICRC, which need 638 * to be subtracted. 639 * The minimum representable packet data length in a header is 4 bytes, 640 * therefore, when the data length request is less than 4 bytes, there's 641 * only one packet, and the packet data length is equal to that of the 642 * request data length. 643 * The size of the remaining packets is the minimum of the frag 644 * size (MTU) or remaining data in the request. 645 */ 646 u32 len; 647 648 if (!req->seqnum) { 649 if (req->data_len < sizeof(u32)) 650 len = req->data_len; 651 else 652 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 653 (sizeof(tx->hdr) - 4)); 654 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 655 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 656 PAGE_SIZE; 657 /* 658 * Get the data length based on the remaining space in the 659 * TID pair. 660 */ 661 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 662 /* If we've filled up the TID pair, move to the next one. */ 663 if (unlikely(!len) && ++req->tididx < req->n_tids && 664 req->tids[req->tididx]) { 665 tidlen = EXP_TID_GET(req->tids[req->tididx], 666 LEN) * PAGE_SIZE; 667 req->tidoffset = 0; 668 len = min_t(u32, tidlen, req->info.fragsize); 669 } 670 /* 671 * Since the TID pairs map entire pages, make sure that we 672 * are not going to try to send more data that we have 673 * remaining. 674 */ 675 len = min(len, req->data_len - req->sent); 676 } else { 677 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 678 } 679 trace_hfi1_sdma_user_compute_length(req->pq->dd, 680 req->pq->ctxt, 681 req->pq->subctxt, 682 req->info.comp_idx, 683 len); 684 return len; 685 } 686 687 static inline u32 pad_len(u32 len) 688 { 689 if (len & (sizeof(u32) - 1)) 690 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 691 return len; 692 } 693 694 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 695 { 696 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 697 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 698 } 699 700 static int user_sdma_txadd_ahg(struct user_sdma_request *req, 701 struct user_sdma_txreq *tx, 702 u32 datalen) 703 { 704 int ret; 705 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 706 u32 lrhlen = get_lrh_len(req->hdr, pad_len(datalen)); 707 struct hfi1_user_sdma_pkt_q *pq = req->pq; 708 709 /* 710 * Copy the request header into the tx header 711 * because the HW needs a cacheline-aligned 712 * address. 713 * This copy can be optimized out if the hdr 714 * member of user_sdma_request were also 715 * cacheline aligned. 716 */ 717 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 718 if (PBC2LRH(pbclen) != lrhlen) { 719 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 720 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 721 } 722 ret = check_header_template(req, &tx->hdr, lrhlen, datalen); 723 if (ret) 724 return ret; 725 ret = sdma_txinit_ahg(&tx->txreq, SDMA_TXREQ_F_AHG_COPY, 726 sizeof(tx->hdr) + datalen, req->ahg_idx, 727 0, NULL, 0, user_sdma_txreq_cb); 728 if (ret) 729 return ret; 730 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, &tx->hdr, sizeof(tx->hdr)); 731 if (ret) 732 sdma_txclean(pq->dd, &tx->txreq); 733 return ret; 734 } 735 736 static int user_sdma_txadd(struct user_sdma_request *req, 737 struct user_sdma_txreq *tx, 738 struct user_sdma_iovec *iovec, u32 datalen, 739 u32 *queued_ptr, u32 *data_sent_ptr, 740 u64 *iov_offset_ptr) 741 { 742 int ret; 743 unsigned int pageidx, len; 744 unsigned long base, offset; 745 u64 iov_offset = *iov_offset_ptr; 746 u32 queued = *queued_ptr, data_sent = *data_sent_ptr; 747 struct hfi1_user_sdma_pkt_q *pq = req->pq; 748 749 base = (unsigned long)iovec->iov.iov_base; 750 offset = offset_in_page(base + iovec->offset + iov_offset); 751 pageidx = (((iovec->offset + iov_offset + base) - (base & PAGE_MASK)) >> 752 PAGE_SHIFT); 753 len = offset + req->info.fragsize > PAGE_SIZE ? 754 PAGE_SIZE - offset : req->info.fragsize; 755 len = min((datalen - queued), len); 756 ret = sdma_txadd_page(pq->dd, &tx->txreq, iovec->pages[pageidx], 757 offset, len); 758 if (ret) { 759 SDMA_DBG(req, "SDMA txreq add page failed %d\n", ret); 760 return ret; 761 } 762 iov_offset += len; 763 queued += len; 764 data_sent += len; 765 if (unlikely(queued < datalen && pageidx == iovec->npages && 766 req->iov_idx < req->data_iovs - 1)) { 767 iovec->offset += iov_offset; 768 iovec = &req->iovs[++req->iov_idx]; 769 iov_offset = 0; 770 } 771 772 *queued_ptr = queued; 773 *data_sent_ptr = data_sent; 774 *iov_offset_ptr = iov_offset; 775 return ret; 776 } 777 778 static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) 779 { 780 int ret = 0; 781 u16 count; 782 unsigned npkts = 0; 783 struct user_sdma_txreq *tx = NULL; 784 struct hfi1_user_sdma_pkt_q *pq = NULL; 785 struct user_sdma_iovec *iovec = NULL; 786 787 if (!req->pq) 788 return -EINVAL; 789 790 pq = req->pq; 791 792 /* If tx completion has reported an error, we are done. */ 793 if (READ_ONCE(req->has_error)) 794 return -EFAULT; 795 796 /* 797 * Check if we might have sent the entire request already 798 */ 799 if (unlikely(req->seqnum == req->info.npkts)) { 800 if (!list_empty(&req->txps)) 801 goto dosend; 802 return ret; 803 } 804 805 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 806 maxpkts = req->info.npkts - req->seqnum; 807 808 while (npkts < maxpkts) { 809 u32 datalen = 0, queued = 0, data_sent = 0; 810 u64 iov_offset = 0; 811 812 /* 813 * Check whether any of the completions have come back 814 * with errors. If so, we are not going to process any 815 * more packets from this request. 816 */ 817 if (READ_ONCE(req->has_error)) 818 return -EFAULT; 819 820 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 821 if (!tx) 822 return -ENOMEM; 823 824 tx->flags = 0; 825 tx->req = req; 826 INIT_LIST_HEAD(&tx->list); 827 828 /* 829 * For the last packet set the ACK request 830 * and disable header suppression. 831 */ 832 if (req->seqnum == req->info.npkts - 1) 833 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 834 TXREQ_FLAGS_REQ_DISABLE_SH); 835 836 /* 837 * Calculate the payload size - this is min of the fragment 838 * (MTU) size or the remaining bytes in the request but only 839 * if we have payload data. 840 */ 841 if (req->data_len) { 842 iovec = &req->iovs[req->iov_idx]; 843 if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) { 844 if (++req->iov_idx == req->data_iovs) { 845 ret = -EFAULT; 846 goto free_tx; 847 } 848 iovec = &req->iovs[req->iov_idx]; 849 WARN_ON(iovec->offset); 850 } 851 852 datalen = compute_data_length(req, tx); 853 854 /* 855 * Disable header suppression for the payload <= 8DWS. 856 * If there is an uncorrectable error in the receive 857 * data FIFO when the received payload size is less than 858 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 859 * not reported.There is set RHF.EccErr if the header 860 * is not suppressed. 861 */ 862 if (!datalen) { 863 SDMA_DBG(req, 864 "Request has data but pkt len is 0"); 865 ret = -EFAULT; 866 goto free_tx; 867 } else if (datalen <= 32) { 868 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 869 } 870 } 871 872 if (req->ahg_idx >= 0) { 873 if (!req->seqnum) { 874 ret = user_sdma_txadd_ahg(req, tx, datalen); 875 if (ret) 876 goto free_tx; 877 } else { 878 int changes; 879 880 changes = set_txreq_header_ahg(req, tx, 881 datalen); 882 if (changes < 0) { 883 ret = changes; 884 goto free_tx; 885 } 886 } 887 } else { 888 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 889 datalen, user_sdma_txreq_cb); 890 if (ret) 891 goto free_tx; 892 /* 893 * Modify the header for this packet. This only needs 894 * to be done if we are not going to use AHG. Otherwise, 895 * the HW will do it based on the changes we gave it 896 * during sdma_txinit_ahg(). 897 */ 898 ret = set_txreq_header(req, tx, datalen); 899 if (ret) 900 goto free_txreq; 901 } 902 903 /* 904 * If the request contains any data vectors, add up to 905 * fragsize bytes to the descriptor. 906 */ 907 while (queued < datalen && 908 (req->sent + data_sent) < req->data_len) { 909 ret = user_sdma_txadd(req, tx, iovec, datalen, 910 &queued, &data_sent, &iov_offset); 911 if (ret) 912 goto free_txreq; 913 } 914 /* 915 * The txreq was submitted successfully so we can update 916 * the counters. 917 */ 918 req->koffset += datalen; 919 if (req_opcode(req->info.ctrl) == EXPECTED) 920 req->tidoffset += datalen; 921 req->sent += data_sent; 922 if (req->data_len) 923 iovec->offset += iov_offset; 924 list_add_tail(&tx->txreq.list, &req->txps); 925 /* 926 * It is important to increment this here as it is used to 927 * generate the BTH.PSN and, therefore, can't be bulk-updated 928 * outside of the loop. 929 */ 930 tx->seqnum = req->seqnum++; 931 npkts++; 932 } 933 dosend: 934 ret = sdma_send_txlist(req->sde, 935 iowait_get_ib_work(&pq->busy), 936 &req->txps, &count); 937 req->seqsubmitted += count; 938 if (req->seqsubmitted == req->info.npkts) { 939 /* 940 * The txreq has already been submitted to the HW queue 941 * so we can free the AHG entry now. Corruption will not 942 * happen due to the sequential manner in which 943 * descriptors are processed. 944 */ 945 if (req->ahg_idx >= 0) 946 sdma_ahg_free(req->sde, req->ahg_idx); 947 } 948 return ret; 949 950 free_txreq: 951 sdma_txclean(pq->dd, &tx->txreq); 952 free_tx: 953 kmem_cache_free(pq->txreq_cache, tx); 954 return ret; 955 } 956 957 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 958 { 959 struct evict_data evict_data; 960 961 evict_data.cleared = 0; 962 evict_data.target = npages; 963 hfi1_mmu_rb_evict(pq->handler, &evict_data); 964 return evict_data.cleared; 965 } 966 967 static int pin_sdma_pages(struct user_sdma_request *req, 968 struct user_sdma_iovec *iovec, 969 struct sdma_mmu_node *node, 970 int npages) 971 { 972 int pinned, cleared; 973 struct page **pages; 974 struct hfi1_user_sdma_pkt_q *pq = req->pq; 975 976 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 977 if (!pages) 978 return -ENOMEM; 979 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 980 981 npages -= node->npages; 982 retry: 983 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 984 atomic_read(&pq->n_locked), npages)) { 985 cleared = sdma_cache_evict(pq, npages); 986 if (cleared >= npages) 987 goto retry; 988 } 989 pinned = hfi1_acquire_user_pages(pq->mm, 990 ((unsigned long)iovec->iov.iov_base + 991 (node->npages * PAGE_SIZE)), npages, 0, 992 pages + node->npages); 993 if (pinned < 0) { 994 kfree(pages); 995 return pinned; 996 } 997 if (pinned != npages) { 998 unpin_vector_pages(pq->mm, pages, node->npages, pinned); 999 return -EFAULT; 1000 } 1001 kfree(node->pages); 1002 node->rb.len = iovec->iov.iov_len; 1003 node->pages = pages; 1004 atomic_add(pinned, &pq->n_locked); 1005 return pinned; 1006 } 1007 1008 static void unpin_sdma_pages(struct sdma_mmu_node *node) 1009 { 1010 if (node->npages) { 1011 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 1012 atomic_sub(node->npages, &node->pq->n_locked); 1013 } 1014 } 1015 1016 static int pin_vector_pages(struct user_sdma_request *req, 1017 struct user_sdma_iovec *iovec) 1018 { 1019 int ret = 0, pinned, npages; 1020 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1021 struct sdma_mmu_node *node = NULL; 1022 struct mmu_rb_node *rb_node; 1023 struct iovec *iov; 1024 bool extracted; 1025 1026 extracted = 1027 hfi1_mmu_rb_remove_unless_exact(pq->handler, 1028 (unsigned long) 1029 iovec->iov.iov_base, 1030 iovec->iov.iov_len, &rb_node); 1031 if (rb_node) { 1032 node = container_of(rb_node, struct sdma_mmu_node, rb); 1033 if (!extracted) { 1034 atomic_inc(&node->refcount); 1035 iovec->pages = node->pages; 1036 iovec->npages = node->npages; 1037 iovec->node = node; 1038 return 0; 1039 } 1040 } 1041 1042 if (!node) { 1043 node = kzalloc(sizeof(*node), GFP_KERNEL); 1044 if (!node) 1045 return -ENOMEM; 1046 1047 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1048 node->pq = pq; 1049 atomic_set(&node->refcount, 0); 1050 } 1051 1052 iov = &iovec->iov; 1053 npages = num_user_pages((unsigned long)iov->iov_base, iov->iov_len); 1054 if (node->npages < npages) { 1055 pinned = pin_sdma_pages(req, iovec, node, npages); 1056 if (pinned < 0) { 1057 ret = pinned; 1058 goto bail; 1059 } 1060 node->npages += pinned; 1061 npages = node->npages; 1062 } 1063 iovec->pages = node->pages; 1064 iovec->npages = npages; 1065 iovec->node = node; 1066 1067 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1068 if (ret) { 1069 iovec->node = NULL; 1070 goto bail; 1071 } 1072 return 0; 1073 bail: 1074 unpin_sdma_pages(node); 1075 kfree(node); 1076 return ret; 1077 } 1078 1079 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1080 unsigned start, unsigned npages) 1081 { 1082 hfi1_release_user_pages(mm, pages + start, npages, false); 1083 kfree(pages); 1084 } 1085 1086 static int check_header_template(struct user_sdma_request *req, 1087 struct hfi1_pkt_header *hdr, u32 lrhlen, 1088 u32 datalen) 1089 { 1090 /* 1091 * Perform safety checks for any type of packet: 1092 * - transfer size is multiple of 64bytes 1093 * - packet length is multiple of 4 bytes 1094 * - packet length is not larger than MTU size 1095 * 1096 * These checks are only done for the first packet of the 1097 * transfer since the header is "given" to us by user space. 1098 * For the remainder of the packets we compute the values. 1099 */ 1100 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1101 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1102 return -EINVAL; 1103 1104 if (req_opcode(req->info.ctrl) == EXPECTED) { 1105 /* 1106 * The header is checked only on the first packet. Furthermore, 1107 * we ensure that at least one TID entry is copied when the 1108 * request is submitted. Therefore, we don't have to verify that 1109 * tididx points to something sane. 1110 */ 1111 u32 tidval = req->tids[req->tididx], 1112 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1113 tididx = EXP_TID_GET(tidval, IDX), 1114 tidctrl = EXP_TID_GET(tidval, CTRL), 1115 tidoff; 1116 __le32 kval = hdr->kdeth.ver_tid_offset; 1117 1118 tidoff = KDETH_GET(kval, OFFSET) * 1119 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1120 KDETH_OM_LARGE : KDETH_OM_SMALL); 1121 /* 1122 * Expected receive packets have the following 1123 * additional checks: 1124 * - offset is not larger than the TID size 1125 * - TIDCtrl values match between header and TID array 1126 * - TID indexes match between header and TID array 1127 */ 1128 if ((tidoff + datalen > tidlen) || 1129 KDETH_GET(kval, TIDCTRL) != tidctrl || 1130 KDETH_GET(kval, TID) != tididx) 1131 return -EINVAL; 1132 } 1133 return 0; 1134 } 1135 1136 /* 1137 * Correctly set the BTH.PSN field based on type of 1138 * transfer - eager packets can just increment the PSN but 1139 * expected packets encode generation and sequence in the 1140 * BTH.PSN field so just incrementing will result in errors. 1141 */ 1142 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1143 { 1144 u32 val = be32_to_cpu(bthpsn), 1145 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 1146 0xffffffull), 1147 psn = val & mask; 1148 if (expct) 1149 psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) | 1150 ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK); 1151 else 1152 psn = psn + frags; 1153 return psn & mask; 1154 } 1155 1156 static int set_txreq_header(struct user_sdma_request *req, 1157 struct user_sdma_txreq *tx, u32 datalen) 1158 { 1159 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1160 struct hfi1_pkt_header *hdr = &tx->hdr; 1161 u8 omfactor; /* KDETH.OM */ 1162 u16 pbclen; 1163 int ret; 1164 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1165 1166 /* Copy the header template to the request before modification */ 1167 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1168 1169 /* 1170 * Check if the PBC and LRH length are mismatched. If so 1171 * adjust both in the header. 1172 */ 1173 pbclen = le16_to_cpu(hdr->pbc[0]); 1174 if (PBC2LRH(pbclen) != lrhlen) { 1175 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1176 hdr->pbc[0] = cpu_to_le16(pbclen); 1177 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1178 /* 1179 * Third packet 1180 * This is the first packet in the sequence that has 1181 * a "static" size that can be used for the rest of 1182 * the packets (besides the last one). 1183 */ 1184 if (unlikely(req->seqnum == 2)) { 1185 /* 1186 * From this point on the lengths in both the 1187 * PBC and LRH are the same until the last 1188 * packet. 1189 * Adjust the template so we don't have to update 1190 * every packet 1191 */ 1192 req->hdr.pbc[0] = hdr->pbc[0]; 1193 req->hdr.lrh[2] = hdr->lrh[2]; 1194 } 1195 } 1196 /* 1197 * We only have to modify the header if this is not the 1198 * first packet in the request. Otherwise, we use the 1199 * header given to us. 1200 */ 1201 if (unlikely(!req->seqnum)) { 1202 ret = check_header_template(req, hdr, lrhlen, datalen); 1203 if (ret) 1204 return ret; 1205 goto done; 1206 } 1207 1208 hdr->bth[2] = cpu_to_be32( 1209 set_pkt_bth_psn(hdr->bth[2], 1210 (req_opcode(req->info.ctrl) == EXPECTED), 1211 req->seqnum)); 1212 1213 /* Set ACK request on last packet */ 1214 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1215 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1216 1217 /* Set the new offset */ 1218 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1219 /* Expected packets have to fill in the new TID information */ 1220 if (req_opcode(req->info.ctrl) == EXPECTED) { 1221 tidval = req->tids[req->tididx]; 1222 /* 1223 * If the offset puts us at the end of the current TID, 1224 * advance everything. 1225 */ 1226 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1227 PAGE_SIZE)) { 1228 req->tidoffset = 0; 1229 /* 1230 * Since we don't copy all the TIDs, all at once, 1231 * we have to check again. 1232 */ 1233 if (++req->tididx > req->n_tids - 1 || 1234 !req->tids[req->tididx]) { 1235 return -EINVAL; 1236 } 1237 tidval = req->tids[req->tididx]; 1238 } 1239 omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1240 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : 1241 KDETH_OM_SMALL_SHIFT; 1242 /* Set KDETH.TIDCtrl based on value for this TID. */ 1243 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1244 EXP_TID_GET(tidval, CTRL)); 1245 /* Set KDETH.TID based on value for this TID */ 1246 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1247 EXP_TID_GET(tidval, IDX)); 1248 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1249 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1250 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1251 /* 1252 * Set the KDETH.OFFSET and KDETH.OM based on size of 1253 * transfer. 1254 */ 1255 trace_hfi1_sdma_user_tid_info( 1256 pq->dd, pq->ctxt, pq->subctxt, req->info.comp_idx, 1257 req->tidoffset, req->tidoffset >> omfactor, 1258 omfactor != KDETH_OM_SMALL_SHIFT); 1259 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1260 req->tidoffset >> omfactor); 1261 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1262 omfactor != KDETH_OM_SMALL_SHIFT); 1263 } 1264 done: 1265 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1266 req->info.comp_idx, hdr, tidval); 1267 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1268 } 1269 1270 static int set_txreq_header_ahg(struct user_sdma_request *req, 1271 struct user_sdma_txreq *tx, u32 datalen) 1272 { 1273 u32 ahg[AHG_KDETH_ARRAY_SIZE]; 1274 int idx = 0; 1275 u8 omfactor; /* KDETH.OM */ 1276 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1277 struct hfi1_pkt_header *hdr = &req->hdr; 1278 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1279 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1280 size_t array_size = ARRAY_SIZE(ahg); 1281 1282 if (PBC2LRH(pbclen) != lrhlen) { 1283 /* PBC.PbcLengthDWs */ 1284 idx = ahg_header_set(ahg, idx, array_size, 0, 0, 12, 1285 (__force u16)cpu_to_le16(LRH2PBC(lrhlen))); 1286 if (idx < 0) 1287 return idx; 1288 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1289 idx = ahg_header_set(ahg, idx, array_size, 3, 0, 16, 1290 (__force u16)cpu_to_be16(lrhlen >> 2)); 1291 if (idx < 0) 1292 return idx; 1293 } 1294 1295 /* 1296 * Do the common updates 1297 */ 1298 /* BTH.PSN and BTH.A */ 1299 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1300 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1301 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1302 val32 |= 1UL << 31; 1303 idx = ahg_header_set(ahg, idx, array_size, 6, 0, 16, 1304 (__force u16)cpu_to_be16(val32 >> 16)); 1305 if (idx < 0) 1306 return idx; 1307 idx = ahg_header_set(ahg, idx, array_size, 6, 16, 16, 1308 (__force u16)cpu_to_be16(val32 & 0xffff)); 1309 if (idx < 0) 1310 return idx; 1311 /* KDETH.Offset */ 1312 idx = ahg_header_set(ahg, idx, array_size, 15, 0, 16, 1313 (__force u16)cpu_to_le16(req->koffset & 0xffff)); 1314 if (idx < 0) 1315 return idx; 1316 idx = ahg_header_set(ahg, idx, array_size, 15, 16, 16, 1317 (__force u16)cpu_to_le16(req->koffset >> 16)); 1318 if (idx < 0) 1319 return idx; 1320 if (req_opcode(req->info.ctrl) == EXPECTED) { 1321 __le16 val; 1322 1323 tidval = req->tids[req->tididx]; 1324 1325 /* 1326 * If the offset puts us at the end of the current TID, 1327 * advance everything. 1328 */ 1329 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1330 PAGE_SIZE)) { 1331 req->tidoffset = 0; 1332 /* 1333 * Since we don't copy all the TIDs, all at once, 1334 * we have to check again. 1335 */ 1336 if (++req->tididx > req->n_tids - 1 || 1337 !req->tids[req->tididx]) 1338 return -EINVAL; 1339 tidval = req->tids[req->tididx]; 1340 } 1341 omfactor = ((EXP_TID_GET(tidval, LEN) * 1342 PAGE_SIZE) >= 1343 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : 1344 KDETH_OM_SMALL_SHIFT; 1345 /* KDETH.OM and KDETH.OFFSET (TID) */ 1346 idx = ahg_header_set( 1347 ahg, idx, array_size, 7, 0, 16, 1348 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | 1349 ((req->tidoffset >> omfactor) 1350 & 0x7fff))); 1351 if (idx < 0) 1352 return idx; 1353 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1354 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1355 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1356 1357 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1358 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1359 INTR) << 1360 AHG_KDETH_INTR_SHIFT)); 1361 } else { 1362 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 1363 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1364 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1365 INTR) << 1366 AHG_KDETH_INTR_SHIFT)); 1367 } 1368 1369 idx = ahg_header_set(ahg, idx, array_size, 1370 7, 16, 14, (__force u16)val); 1371 if (idx < 0) 1372 return idx; 1373 } 1374 1375 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1376 req->info.comp_idx, req->sde->this_idx, 1377 req->ahg_idx, ahg, idx, tidval); 1378 sdma_txinit_ahg(&tx->txreq, 1379 SDMA_TXREQ_F_USE_AHG, 1380 datalen, req->ahg_idx, idx, 1381 ahg, sizeof(req->hdr), 1382 user_sdma_txreq_cb); 1383 1384 return idx; 1385 } 1386 1387 /** 1388 * user_sdma_txreq_cb() - SDMA tx request completion callback. 1389 * @txreq: valid sdma tx request 1390 * @status: success/failure of request 1391 * 1392 * Called when the SDMA progress state machine gets notification that 1393 * the SDMA descriptors for this tx request have been processed by the 1394 * DMA engine. Called in interrupt context. 1395 * Only do work on completed sequences. 1396 */ 1397 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1398 { 1399 struct user_sdma_txreq *tx = 1400 container_of(txreq, struct user_sdma_txreq, txreq); 1401 struct user_sdma_request *req; 1402 struct hfi1_user_sdma_pkt_q *pq; 1403 struct hfi1_user_sdma_comp_q *cq; 1404 enum hfi1_sdma_comp_state state = COMPLETE; 1405 1406 if (!tx->req) 1407 return; 1408 1409 req = tx->req; 1410 pq = req->pq; 1411 cq = req->cq; 1412 1413 if (status != SDMA_TXREQ_S_OK) { 1414 SDMA_DBG(req, "SDMA completion with error %d", 1415 status); 1416 WRITE_ONCE(req->has_error, 1); 1417 state = ERROR; 1418 } 1419 1420 req->seqcomp = tx->seqnum; 1421 kmem_cache_free(pq->txreq_cache, tx); 1422 1423 /* sequence isn't complete? We are done */ 1424 if (req->seqcomp != req->info.npkts - 1) 1425 return; 1426 1427 user_sdma_free_request(req, false); 1428 set_comp_state(pq, cq, req->info.comp_idx, state, status); 1429 pq_update(pq); 1430 } 1431 1432 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1433 { 1434 if (atomic_dec_and_test(&pq->n_reqs)) 1435 wake_up(&pq->wait); 1436 } 1437 1438 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1439 { 1440 int i; 1441 1442 if (!list_empty(&req->txps)) { 1443 struct sdma_txreq *t, *p; 1444 1445 list_for_each_entry_safe(t, p, &req->txps, list) { 1446 struct user_sdma_txreq *tx = 1447 container_of(t, struct user_sdma_txreq, txreq); 1448 list_del_init(&t->list); 1449 sdma_txclean(req->pq->dd, t); 1450 kmem_cache_free(req->pq->txreq_cache, tx); 1451 } 1452 } 1453 1454 for (i = 0; i < req->data_iovs; i++) { 1455 struct sdma_mmu_node *node = req->iovs[i].node; 1456 1457 if (!node) 1458 continue; 1459 1460 req->iovs[i].node = NULL; 1461 1462 if (unpin) 1463 hfi1_mmu_rb_remove(req->pq->handler, 1464 &node->rb); 1465 else 1466 atomic_dec(&node->refcount); 1467 } 1468 1469 kfree(req->tids); 1470 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1471 } 1472 1473 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1474 struct hfi1_user_sdma_comp_q *cq, 1475 u16 idx, enum hfi1_sdma_comp_state state, 1476 int ret) 1477 { 1478 if (state == ERROR) 1479 cq->comps[idx].errcode = -ret; 1480 smp_wmb(); /* make sure errcode is visible first */ 1481 cq->comps[idx].status = state; 1482 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1483 idx, state, ret); 1484 } 1485 1486 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1487 unsigned long len) 1488 { 1489 return (bool)(node->addr == addr); 1490 } 1491 1492 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1493 { 1494 struct sdma_mmu_node *node = 1495 container_of(mnode, struct sdma_mmu_node, rb); 1496 1497 atomic_inc(&node->refcount); 1498 return 0; 1499 } 1500 1501 /* 1502 * Return 1 to remove the node from the rb tree and call the remove op. 1503 * 1504 * Called with the rb tree lock held. 1505 */ 1506 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1507 void *evict_arg, bool *stop) 1508 { 1509 struct sdma_mmu_node *node = 1510 container_of(mnode, struct sdma_mmu_node, rb); 1511 struct evict_data *evict_data = evict_arg; 1512 1513 /* is this node still being used? */ 1514 if (atomic_read(&node->refcount)) 1515 return 0; /* keep this node */ 1516 1517 /* this node will be evicted, add its pages to our count */ 1518 evict_data->cleared += node->npages; 1519 1520 /* have enough pages been cleared? */ 1521 if (evict_data->cleared >= evict_data->target) 1522 *stop = true; 1523 1524 return 1; /* remove this node */ 1525 } 1526 1527 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1528 { 1529 struct sdma_mmu_node *node = 1530 container_of(mnode, struct sdma_mmu_node, rb); 1531 1532 unpin_sdma_pages(node); 1533 kfree(node); 1534 } 1535 1536 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1537 { 1538 struct sdma_mmu_node *node = 1539 container_of(mnode, struct sdma_mmu_node, rb); 1540 1541 if (!atomic_read(&node->refcount)) 1542 return 1; 1543 return 0; 1544 } 1545