1 /* 2 * Copyright(c) 2015 - 2017 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dmapool.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/io.h>
#include <linux/uio.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mmu_context.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/string.h>

#include "hfi.h"
#include "sdma.h"
#include "user_sdma.h"
#include "verbs.h"  /* for the headers */
#include "common.h" /* for struct hfi1_tid_info */
#include "trace.h"
#include "mmu_rb.h"

/*
 * Number of entries in each user SDMA completion ring.  Exposed as the
 * read-only module parameter "sdma_comp_size"; a value of 0 makes
 * hfi1_user_sdma_alloc_queues() fail with -EINVAL.
 */
static uint hfi1_sdma_comp_ring_size = 128;
module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");

/* The maximum number of Data io vectors per message/request */
#define MAX_VECTORS_PER_REQ 8
/*
 * Maximum number of packets to send from each message/request
 * before moving to the next one.
82 */ 83 #define MAX_PKTS_PER_QUEUE 16 84 85 #define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) 86 87 #define req_opcode(x) \ 88 (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 89 #define req_version(x) \ 90 (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 91 #define req_iovcnt(x) \ 92 (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) 93 94 /* Number of BTH.PSN bits used for sequence number in expected rcvs */ 95 #define BTH_SEQ_MASK 0x7ffull 96 97 /* 98 * Define fields in the KDETH header so we can update the header 99 * template. 100 */ 101 #define KDETH_OFFSET_SHIFT 0 102 #define KDETH_OFFSET_MASK 0x7fff 103 #define KDETH_OM_SHIFT 15 104 #define KDETH_OM_MASK 0x1 105 #define KDETH_TID_SHIFT 16 106 #define KDETH_TID_MASK 0x3ff 107 #define KDETH_TIDCTRL_SHIFT 26 108 #define KDETH_TIDCTRL_MASK 0x3 109 #define KDETH_INTR_SHIFT 28 110 #define KDETH_INTR_MASK 0x1 111 #define KDETH_SH_SHIFT 29 112 #define KDETH_SH_MASK 0x1 113 #define KDETH_HCRC_UPPER_SHIFT 16 114 #define KDETH_HCRC_UPPER_MASK 0xff 115 #define KDETH_HCRC_LOWER_SHIFT 24 116 #define KDETH_HCRC_LOWER_MASK 0xff 117 118 #define AHG_KDETH_INTR_SHIFT 12 119 #define AHG_KDETH_SH_SHIFT 13 120 121 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) 122 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) 123 124 #define KDETH_GET(val, field) \ 125 (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) 126 #define KDETH_SET(dw, field, val) do { \ 127 u32 dwval = le32_to_cpu(dw); \ 128 dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ 129 dwval |= (((val) & KDETH_##field##_MASK) << \ 130 KDETH_##field##_SHIFT); \ 131 dw = cpu_to_le32(dwval); \ 132 } while (0) 133 134 #define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ 135 do { \ 136 if ((idx) < ARRAY_SIZE((arr))) \ 137 (arr)[(idx++)] = sdma_build_ahg_descriptor( \ 138 (__force u16)(value), (dw), (bit), \ 139 (width)); \ 140 else \ 141 return -ERANGE; \ 142 } while (0) 143 144 
/* KDETH OM multipliers and switch over point */
#define KDETH_OM_SMALL     4
#define KDETH_OM_SMALL_SHIFT     2
#define KDETH_OM_LARGE     64
#define KDETH_OM_LARGE_SHIFT     6
#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))

/* Tx request flag bits (bit masks stored in user_sdma_txreq.flags) */
#define TXREQ_FLAGS_REQ_ACK   BIT(0)      /* Set the ACK bit in the header */
#define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */

/*
 * SDMA request flag bits.  These are bit *numbers* (not masks) used with
 * test_bit()/set_bit() on user_sdma_request.flags.
 */
#define SDMA_REQ_FOR_THREAD 1
#define SDMA_REQ_SEND_DONE  2
#define SDMA_REQ_HAS_ERROR  3
#define SDMA_REQ_DONE_ERROR 4

/* Packet queue states, stored in hfi1_user_sdma_pkt_q.state */
#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE   BIT(1)
#define SDMA_PKT_Q_DEFERRED BIT(2)

/*
 * Maximum retry attempts to submit a TX request
 * before putting the process to sleep.
 */
#define MAX_DEFER_RETRY_COUNT 1

/* Number of packets submitted by the first send attempt of a request */
static unsigned initial_pkt_count = 8;

#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */

struct sdma_mmu_node;

/* Per-request bookkeeping for one user data io vector */
struct user_sdma_iovec {
	struct list_head list;
	/* copy of the caller's iovec entry (base/length are user addresses) */
	struct iovec iov;
	/* number of pages in this vector */
	unsigned npages;
	/* array of pinned pages for this vector */
	struct page **pages;
	/*
	 * offset into the virtual address space of the vector at
	 * which we last left off.
	 */
	u64 offset;
	struct sdma_mmu_node *node;
};

/* Node stored in the per-queue MMU rb tree (see sdma_rb_ops) */
struct sdma_mmu_node {
	struct mmu_rb_node rb;
	struct hfi1_user_sdma_pkt_q *pq;
	atomic_t refcount;
	struct page **pages;
	unsigned npages;
};

/* evict operation argument */
struct evict_data {
	u32 cleared;	/* count evicted so far */
	u32 target;	/* target count to evict */
};

/* State of one user SDMA request; one slot per completion ring entry */
struct user_sdma_request {
	/* request info block copied from the first user io vector */
	struct sdma_req_info info;
	struct hfi1_user_sdma_pkt_q *pq;
	struct hfi1_user_sdma_comp_q *cq;
	/* This is the original header from user space */
	struct hfi1_pkt_header hdr;
	/*
	 * Pointer to the SDMA engine for this request.
	 * Since different requests could be on different VLs,
	 * each request will need its own engine pointer.
	 */
	struct sdma_engine *sde;
	/* AHG entry index, or negative when AHG is not used */
	s8 ahg_idx;
	u32 ahg[9];
	/*
	 * KDETH.Offset (Eager) field
	 * We need to remember the initial value so the headers
	 * can be updated properly.
	 */
	u32 koffset;
	/*
	 * KDETH.OFFSET (TID) field
	 * The offset can cover multiple packets, depending on the
	 * size of the TID entry.
	 */
	u32 tidoffset;
	/*
	 * We copy the iovs for this request (based on
	 * info.iovcnt). These are only the data vectors
	 */
	unsigned data_iovs;
	/* total length of the data in the request */
	u32 data_len;
	/* progress index moving along the iovs array */
	unsigned iov_idx;
	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
	/* number of elements copied to the tids array */
	u16 n_tids;
	/* TID array values copied from the tid_iov vector */
	u32 *tids;
	/* index of the TID entry currently being consumed */
	u16 tididx;
	/* number of data bytes already added to txreqs */
	u32 sent;
	/* sequence number of the next packet to build */
	u64 seqnum;
	/* NOTE(review): presumably advanced by the completion callback */
	u64 seqcomp;
	/* number of packets handed to sdma_send_txlist() so far */
	u64 seqsubmitted;
	/* txreqs built but not yet accepted by the engine */
	struct list_head txps;
	/* SDMA_REQ_* bit numbers */
	unsigned long flags;
	/* status of the last txreq completed */
	int status;
};

/*
 * A single txreq could span up to 3 physical pages when the MTU
 * is sufficiently large (> 4K).
Each of the IOV pointers also
 * needs its own set of flags so that the vectors can be handled
 * independently of each other.
 */
struct user_sdma_txreq {
	/* Packet header for the txreq */
	struct hfi1_pkt_header hdr;
	struct sdma_txreq txreq;
	struct list_head list;
	/* owning request */
	struct user_sdma_request *req;
	/* TXREQ_FLAGS_* bit masks */
	u16 flags;
	/* number of times defer_packet_queue() asked for a retry */
	unsigned busycount;
	/* position of this packet within the request */
	u64 seqnum;
};

/* Debug trace prefixed with [unit:ctxt:subctxt:comp_idx] of a request */
#define SDMA_DBG(req, fmt, ...)				     \
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \
		 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \
		 ##__VA_ARGS__)
/* Debug trace prefixed with [unit:ctxt:subctxt] of a packet queue */
#define SDMA_Q_DBG(pq, fmt, ...)			 \
	hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \
		 (pq)->subctxt, ##__VA_ARGS__)

static int user_sdma_send_pkts(struct user_sdma_request *req,
			       unsigned maxpkts);
static int num_user_pages(const struct iovec *iov);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
static int pin_vector_pages(struct user_sdma_request *req,
			    struct user_sdma_iovec *iovec);
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages);
static int check_header_template(struct user_sdma_request *req,
				 struct hfi1_pkt_header *hdr, u32 lrhlen,
				 u32 datalen);
static int set_txreq_header(struct user_sdma_request *req,
			    struct user_sdma_txreq *tx, u32 datalen);
static int set_txreq_header_ahg(struct user_sdma_request *req,
				struct user_sdma_txreq *tx, u32 len);
static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
				  struct hfi1_user_sdma_comp_q *cq,
				  u16 idx, enum hfi1_sdma_comp_state state,
				  int ret);
static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags);
static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);

static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	unsigned int seq);
static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
			   unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
			 void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);

/* Callbacks handed to hfi1_mmu_rb_register() for each packet queue */
static struct mmu_rb_ops sdma_rb_ops = {
	.filter = sdma_rb_filter,
	.insert = sdma_rb_insert,
	.evict = sdma_rb_evict,
	.remove = sdma_rb_remove,
	.invalidate = sdma_rb_invalidate
};

/*
 * iowait callback passed to iowait_init() for the packet queue.
 *
 * Returns -EAGAIN when sdma_progress() reports progress and this txreq has
 * not yet used up MAX_DEFER_RETRY_COUNT retries.  Otherwise marks the queue
 * SDMA_PKT_Q_DEFERRED, puts it on the engine's dmawait list (if it is not
 * already on a list) and returns -EBUSY.
 */
static int defer_packet_queue(
	struct sdma_engine *sde,
	struct iowait *wait,
	struct sdma_txreq *txreq,
	unsigned seq)
{
	struct hfi1_user_sdma_pkt_q *pq =
		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
	struct user_sdma_txreq *tx =
		container_of(txreq, struct user_sdma_txreq, txreq);

	if (sdma_progress(sde, seq, txreq)) {
		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
			goto eagain;
	}
	/*
	 * We are assuming that if the list is enqueued somewhere, it
	 * is to the dmawait list since that is the only place where
	 * it is supposed to be enqueued.
348 */ 349 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 350 write_seqlock(&dev->iowait_lock); 351 if (list_empty(&pq->busy.list)) 352 list_add_tail(&pq->busy.list, &sde->dmawait); 353 write_sequnlock(&dev->iowait_lock); 354 return -EBUSY; 355 eagain: 356 return -EAGAIN; 357 } 358 359 static void activate_packet_queue(struct iowait *wait, int reason) 360 { 361 struct hfi1_user_sdma_pkt_q *pq = 362 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 363 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 364 wake_up(&wait->wait_dma); 365 }; 366 367 static void sdma_kmem_cache_ctor(void *obj) 368 { 369 struct user_sdma_txreq *tx = obj; 370 371 memset(tx, 0, sizeof(*tx)); 372 } 373 374 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, 375 struct hfi1_filedata *fd) 376 { 377 int ret = -ENOMEM; 378 char buf[64]; 379 struct hfi1_devdata *dd; 380 struct hfi1_user_sdma_comp_q *cq; 381 struct hfi1_user_sdma_pkt_q *pq; 382 unsigned long flags; 383 384 if (!uctxt || !fd) 385 return -EBADF; 386 387 if (!hfi1_sdma_comp_ring_size) 388 return -EINVAL; 389 390 dd = uctxt->dd; 391 392 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 393 if (!pq) 394 return -ENOMEM; 395 396 INIT_LIST_HEAD(&pq->list); 397 pq->dd = dd; 398 pq->ctxt = uctxt->ctxt; 399 pq->subctxt = fd->subctxt; 400 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 401 pq->state = SDMA_PKT_Q_INACTIVE; 402 atomic_set(&pq->n_reqs, 0); 403 init_waitqueue_head(&pq->wait); 404 atomic_set(&pq->n_locked, 0); 405 pq->mm = fd->mm; 406 407 iowait_init(&pq->busy, 0, NULL, defer_packet_queue, 408 activate_packet_queue, NULL); 409 pq->reqidx = 0; 410 411 pq->reqs = kcalloc(hfi1_sdma_comp_ring_size, 412 sizeof(*pq->reqs), 413 GFP_KERNEL); 414 if (!pq->reqs) 415 goto pq_reqs_nomem; 416 417 pq->req_in_use = kcalloc(BITS_TO_LONGS(hfi1_sdma_comp_ring_size), 418 sizeof(*pq->req_in_use), 419 GFP_KERNEL); 420 if (!pq->req_in_use) 421 goto pq_reqs_no_in_use; 422 423 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 424 fd->subctxt); 425 
pq->txreq_cache = kmem_cache_create(buf, 426 sizeof(struct user_sdma_txreq), 427 L1_CACHE_BYTES, 428 SLAB_HWCACHE_ALIGN, 429 sdma_kmem_cache_ctor); 430 if (!pq->txreq_cache) { 431 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 432 uctxt->ctxt); 433 goto pq_txreq_nomem; 434 } 435 436 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 437 if (!cq) 438 goto cq_nomem; 439 440 cq->comps = vmalloc_user(PAGE_ALIGN(sizeof(*cq->comps) 441 * hfi1_sdma_comp_ring_size)); 442 if (!cq->comps) 443 goto cq_comps_nomem; 444 445 cq->nentries = hfi1_sdma_comp_ring_size; 446 447 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 448 &pq->handler); 449 if (ret) { 450 dd_dev_err(dd, "Failed to register with MMU %d", ret); 451 goto pq_mmu_fail; 452 } 453 454 fd->pq = pq; 455 fd->cq = cq; 456 457 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 458 list_add(&pq->list, &uctxt->sdma_queues); 459 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 460 461 return 0; 462 463 pq_mmu_fail: 464 vfree(cq->comps); 465 cq_comps_nomem: 466 kfree(cq); 467 cq_nomem: 468 kmem_cache_destroy(pq->txreq_cache); 469 pq_txreq_nomem: 470 kfree(pq->req_in_use); 471 pq_reqs_no_in_use: 472 kfree(pq->reqs); 473 pq_reqs_nomem: 474 kfree(pq); 475 476 return ret; 477 } 478 479 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) 480 { 481 struct hfi1_ctxtdata *uctxt = fd->uctxt; 482 struct hfi1_user_sdma_pkt_q *pq; 483 unsigned long flags; 484 485 hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, 486 uctxt->ctxt, fd->subctxt); 487 pq = fd->pq; 488 if (pq) { 489 if (pq->handler) 490 hfi1_mmu_rb_unregister(pq->handler); 491 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 492 if (!list_empty(&pq->list)) 493 list_del_init(&pq->list); 494 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 495 iowait_sdma_drain(&pq->busy); 496 /* Wait until all requests have been freed. 
*/
		/*
		 * NOTE(review): the result of the interruptible wait is
		 * ignored, so if the sleep is cut short by a signal the
		 * frees below run while the queue may not yet be INACTIVE.
		 * Confirm that this is intended.
		 */
		wait_event_interruptible(
			pq->wait,
			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
		kfree(pq->reqs);
		kfree(pq->req_in_use);
		kmem_cache_destroy(pq->txreq_cache);
		kfree(pq);
		fd->pq = NULL;
	}
	if (fd->cq) {
		vfree(fd->cq->comps);
		kfree(fd->cq);
		fd->cq = NULL;
	}
	return 0;
}

/*
 * Map a DLID to a selector in the range 0..0x7F.
 *
 * The DLID is folded to 8 bits (high byte XOR low byte).  The first time a
 * hash value is seen it is assigned the next selector in round-robin order;
 * later lookups return the same selector.  0xFF marks an unassigned slot
 * and can never be handed out because `next` wraps at 0x7F.
 *
 * The tables are function-local statics shared by all callers; no locking
 * is visible here.
 */
static u8 dlid_to_selector(u16 dlid)
{
	static u8 mapping[256];
	static int initialized;
	static u8 next;
	int hash;

	if (!initialized) {
		memset(mapping, 0xFF, 256);
		initialized = 1;
	}

	hash = ((dlid >> 8) ^ dlid) & 0xFF;
	if (mapping[hash] == 0xFF) {
		mapping[hash] = next;
		next = (next + 1) & 0x7F;
	}

	return mapping[hash];
}

/*
 * Validate and start one user SDMA request.
 *
 * @fd:    per-open file data providing the packet and completion queues
 * @iovec: io vector array.  Entry 0 holds the struct sdma_req_info followed
 *         by the packet header template; the data vectors follow, and for
 *         EXPECTED requests one more vector holding the TID array.  The
 *         iov_base pointers are user addresses.
 * @dim:   number of entries in @iovec
 * @count: incremented by the number of vectors consumed on success
 *
 * The caller is blocked until every packet of the request has been
 * submitted to the SDMA engine (not until the sends complete).
 *
 * Return: 0 on success or a negative errno.  Once the request slot has been
 * claimed, most failures release it and report ERROR in the completion
 * ring; the one exception is a late send error with packets still
 * outstanding, which returns without cleaning up here.
 */
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
				   struct iovec *iovec, unsigned long dim,
				   unsigned long *count)
{
	int ret = 0, i;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
	struct hfi1_user_sdma_comp_q *cq = fd->cq;
	struct hfi1_devdata *dd = pq->dd;
	unsigned long idx = 0;
	u8 pcount = initial_pkt_count;
	struct sdma_req_info info;
	struct user_sdma_request *req;
	u8 opcode, sc, vl;
	int req_queued = 0;
	u16 dlid;
	u32 selector;

	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
		hfi1_cdbg(
		   SDMA,
		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
		   dd->unit, uctxt->ctxt, fd->subctxt,
		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
		return -EINVAL;
	}
	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
	if (ret) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
		return -EFAULT;
	}

	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
				     (u16 *)&info);

	/* comp_idx selects both the request slot and the completion entry */
	if (info.comp_idx >= hfi1_sdma_comp_ring_size) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid comp index",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/*
	 * Sanity check the header io vector count.  Need at least 1 vector
	 * (header) and cannot be larger than the actual io vector count.
	 */
	if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Invalid iov count %d, dim %ld",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx,
			  req_iovcnt(info.ctrl), dim);
		return -EINVAL;
	}

	if (!info.fragsize) {
		hfi1_cdbg(SDMA,
			  "[%u:%u:%u:%u] Request does not specify fragsize",
			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
		return -EINVAL;
	}

	/* Try to claim the request. */
	if (test_and_set_bit(info.comp_idx, pq->req_in_use)) {
		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use",
			  dd->unit, uctxt->ctxt, fd->subctxt,
			  info.comp_idx);
		return -EBADSLT;
	}
	/*
	 * All safety checks have been done and this request has been claimed.
	 * From here on, failures must go through free_req so the slot is
	 * released again.
	 */
	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
		  uctxt->ctxt, fd->subctxt, info.comp_idx);
	req = pq->reqs + info.comp_idx;
	memset(req, 0, sizeof(*req));
	req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */
	req->pq = pq;
	req->cq = cq;
	req->status = -1;
	req->ahg_idx = -1;
	INIT_LIST_HEAD(&req->txps);

	memcpy(&req->info, &info, sizeof(info));

	if (req_opcode(info.ctrl) == EXPECTED) {
		/* expected must have a TID info and at least one data vector */
		if (req->data_iovs < 2) {
			SDMA_DBG(req,
				 "Not enough vectors for expected request");
			ret = -EINVAL;
			goto free_req;
		}
		/* the last vector carries the TIDs, not data */
		req->data_iovs--;
	}

	/* Also rejects requests that declare zero packets */
	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
			 MAX_VECTORS_PER_REQ);
		ret = -EINVAL;
		goto free_req;
	}
	/* Copy the header from the user buffer */
	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
			     sizeof(req->hdr));
	if (ret) {
		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
		ret = -EFAULT;
		goto free_req;
	}

	/* If Static rate control is not enabled, sanitize the header. */
	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
		req->hdr.pbc[2] = 0;

	/* Validate the opcode. Do not trust packets from user space blindly. */
	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
	if ((opcode & USER_OPCODE_CHECK_MASK) !=
	     USER_OPCODE_CHECK_VAL) {
		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
		ret = -EINVAL;
		goto free_req;
	}
	/*
	 * Validate the vl. Do not trust packets from user space blindly.
	 * VL comes from PBC, SC comes from LRH, and the VL needs to
	 * match the SC look up.
	 */
	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
	if (vl >= dd->pport->vls_operational ||
	    vl != sc_to_vlt(dd, sc)) {
		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
		ret = -EINVAL;
		goto free_req;
	}

	/* Checking P_KEY for requests from user-space */
	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
			      PKEY_CHECK_INVALID)) {
		ret = -EINVAL;
		goto free_req;
	}

	/*
	 * Also should check the LRH.lnh. If it says the next header is GRH then
	 * the RXE parsing will be off and will land in the middle of the KDETH
	 * or miss it entirely.
	 */
	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
		SDMA_DBG(req, "User tried to pass in a GRH");
		ret = -EINVAL;
		goto free_req;
	}

	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
	/*
	 * Calculate the initial TID offset based on the values of
	 * KDETH.OFFSET and KDETH.OM that are passed in.
	 */
	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
		 KDETH_OM_LARGE : KDETH_OM_SMALL);
	SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
	idx++;

	/* Save all the IO vector structures */
	for (i = 0; i < req->data_iovs; i++) {
		INIT_LIST_HEAD(&req->iovs[i].list);
		memcpy(&req->iovs[i].iov,
		       iovec + idx++,
		       sizeof(req->iovs[i].iov));
		ret = pin_vector_pages(req, &req->iovs[i]);
		if (ret) {
			req->status = ret;
			goto free_req;
		}
		req->data_len += req->iovs[i].iov.iov_len;
	}
	SDMA_DBG(req, "total data length %u", req->data_len);

	/* Never try to send more packets than the request contains */
	if (pcount > req->info.npkts)
		pcount = req->info.npkts;
	/*
	 * Copy any TID info
	 * User space will provide the TID info only when the
	 * request type is EXPECTED. This is true even if there is
	 * only one packet in the request and the header is already
	 * setup. The reason for the singular TID case is that the
	 * driver needs to perform safety checks.
	 */
	if (req_opcode(req->info.ctrl) == EXPECTED) {
		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
		u32 *tmp;

		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
			ret = -EINVAL;
			goto free_req;
		}

		/*
		 * We have to copy all of the tids because they may vary
		 * in size and, therefore, the TID count might not be
		 * equal to the pkt count. However, there is no way to
		 * tell at this point.
		 */
		tmp = memdup_user(iovec[idx].iov_base,
				  ntids * sizeof(*req->tids));
		if (IS_ERR(tmp)) {
			ret = PTR_ERR(tmp);
			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
				 ntids, ret);
			goto free_req;
		}
		req->tids = tmp;
		req->n_tids = ntids;
		idx++;
	}

	/* Pick an engine from the DLID hash, the context and the VL */
	dlid = be16_to_cpu(req->hdr.lrh[1]);
	selector = dlid_to_selector(dlid);
	selector += uctxt->ctxt + fd->subctxt;
	req->sde = sdma_select_user_engine(dd, selector, vl);

	if (!req->sde || !sdma_running(req->sde)) {
		ret = -ECOMM;
		goto free_req;
	}

	/* We don't need an AHG entry if the request contains only one packet */
	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG))
		req->ahg_idx = sdma_ahg_alloc(req->sde);

	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
	atomic_inc(&pq->n_reqs);
	req_queued = 1;
	/* Send the first N packets in the request to buy us some time */
	ret = user_sdma_send_pkts(req, pcount);
	if (unlikely(ret < 0 && ret != -EBUSY)) {
		req->status = ret;
		goto free_req;
	}

	/*
	 * It is possible that the SDMA engine would have processed all the
	 * submitted packets by the time we get here. Therefore, only set
	 * packet queue state to ACTIVE if there are still uncompleted
	 * requests.
	 */
	if (atomic_read(&pq->n_reqs))
		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);

	/*
	 * This is a somewhat blocking send implementation.
	 * The driver will block the caller until all packets of the
	 * request have been submitted to the SDMA engine. However, it
	 * will not wait for send completions.
	 */
	while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
		ret = user_sdma_send_pkts(req, pcount);
		if (ret < 0) {
			if (ret != -EBUSY) {
				req->status = ret;
				set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
				/*
				 * Clean up here only when the completion
				 * count has reached seqsubmitted - 1;
				 * otherwise return and leave the request
				 * allocated.
				 */
				if (ACCESS_ONCE(req->seqcomp) ==
				    req->seqsubmitted - 1)
					goto free_req;
				return ret;
			}
			/* -EBUSY: sleep until the queue is reactivated */
			wait_event_interruptible_timeout(
				pq->busy.wait_dma,
				(pq->state == SDMA_PKT_Q_ACTIVE),
				msecs_to_jiffies(
					SDMA_IOWAIT_TIMEOUT));
		}
	}
	*count += idx;
	return 0;
free_req:
	user_sdma_free_request(req, true);
	if (req_queued)
		pq_update(pq);
	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
	return ret;
}

/*
 * Compute the payload length of the next packet of @req.  May advance
 * req->tididx and reset req->tidoffset when the current TID entry has been
 * used up.  @tx is only used for sizeof(tx->hdr).
 */
static inline u32 compute_data_length(struct user_sdma_request *req,
				      struct user_sdma_txreq *tx)
{
	/*
	 * Determine the proper size of the packet data.
	 * The size of the data of the first packet is in the header
	 * template. However, it includes the header and ICRC, which need
	 * to be subtracted.
	 * The minimum representable packet data length in a header is 4 bytes,
	 * therefore, when the data length request is less than 4 bytes, there's
	 * only one packet, and the packet data length is equal to that of the
	 * request data length.
	 * The size of the remaining packets is the minimum of the frag
	 * size (MTU) or remaining data in the request.
	 */
	u32 len;

	if (!req->seqnum) {
		if (req->data_len < sizeof(u32))
			len = req->data_len;
		else
			len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
			       (sizeof(tx->hdr) - 4));
	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
			PAGE_SIZE;
		/*
		 * Get the data length based on the remaining space in the
		 * TID pair.
		 */
		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
		/* If we've filled up the TID pair, move to the next one.
*/ 854 if (unlikely(!len) && ++req->tididx < req->n_tids && 855 req->tids[req->tididx]) { 856 tidlen = EXP_TID_GET(req->tids[req->tididx], 857 LEN) * PAGE_SIZE; 858 req->tidoffset = 0; 859 len = min_t(u32, tidlen, req->info.fragsize); 860 } 861 /* 862 * Since the TID pairs map entire pages, make sure that we 863 * are not going to try to send more data that we have 864 * remaining. 865 */ 866 len = min(len, req->data_len - req->sent); 867 } else { 868 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 869 } 870 SDMA_DBG(req, "Data Length = %u", len); 871 return len; 872 } 873 874 static inline u32 pad_len(u32 len) 875 { 876 if (len & (sizeof(u32) - 1)) 877 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 878 return len; 879 } 880 881 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 882 { 883 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 884 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 885 } 886 887 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) 888 { 889 int ret = 0, count; 890 unsigned npkts = 0; 891 struct user_sdma_txreq *tx = NULL; 892 struct hfi1_user_sdma_pkt_q *pq = NULL; 893 struct user_sdma_iovec *iovec = NULL; 894 895 if (!req->pq) 896 return -EINVAL; 897 898 pq = req->pq; 899 900 /* If tx completion has reported an error, we are done. */ 901 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 902 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 903 return -EFAULT; 904 } 905 906 /* 907 * Check if we might have sent the entire request already 908 */ 909 if (unlikely(req->seqnum == req->info.npkts)) { 910 if (!list_empty(&req->txps)) 911 goto dosend; 912 return ret; 913 } 914 915 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 916 maxpkts = req->info.npkts - req->seqnum; 917 918 while (npkts < maxpkts) { 919 u32 datalen = 0, queued = 0, data_sent = 0; 920 u64 iov_offset = 0; 921 922 /* 923 * Check whether any of the completions have come back 924 * with errors. 
If so, we are not going to process any 925 * more packets from this request. 926 */ 927 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 928 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 929 return -EFAULT; 930 } 931 932 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 933 if (!tx) 934 return -ENOMEM; 935 936 tx->flags = 0; 937 tx->req = req; 938 tx->busycount = 0; 939 INIT_LIST_HEAD(&tx->list); 940 941 /* 942 * For the last packet set the ACK request 943 * and disable header suppression. 944 */ 945 if (req->seqnum == req->info.npkts - 1) 946 tx->flags |= (TXREQ_FLAGS_REQ_ACK | 947 TXREQ_FLAGS_REQ_DISABLE_SH); 948 949 /* 950 * Calculate the payload size - this is min of the fragment 951 * (MTU) size or the remaining bytes in the request but only 952 * if we have payload data. 953 */ 954 if (req->data_len) { 955 iovec = &req->iovs[req->iov_idx]; 956 if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { 957 if (++req->iov_idx == req->data_iovs) { 958 ret = -EFAULT; 959 goto free_txreq; 960 } 961 iovec = &req->iovs[req->iov_idx]; 962 WARN_ON(iovec->offset); 963 } 964 965 datalen = compute_data_length(req, tx); 966 967 /* 968 * Disable header suppression for the payload <= 8DWS. 969 * If there is an uncorrectable error in the receive 970 * data FIFO when the received payload size is less than 971 * or equal to 8DWS then the RxDmaDataFifoRdUncErr is 972 * not reported.There is set RHF.EccErr if the header 973 * is not suppressed. 974 */ 975 if (!datalen) { 976 SDMA_DBG(req, 977 "Request has data but pkt len is 0"); 978 ret = -EFAULT; 979 goto free_tx; 980 } else if (datalen <= 32) { 981 tx->flags |= TXREQ_FLAGS_REQ_DISABLE_SH; 982 } 983 } 984 985 if (req->ahg_idx >= 0) { 986 if (!req->seqnum) { 987 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 988 u32 lrhlen = get_lrh_len(req->hdr, 989 pad_len(datalen)); 990 /* 991 * Copy the request header into the tx header 992 * because the HW needs a cacheline-aligned 993 * address. 
994 * This copy can be optimized out if the hdr 995 * member of user_sdma_request were also 996 * cacheline aligned. 997 */ 998 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 999 if (PBC2LRH(pbclen) != lrhlen) { 1000 pbclen = (pbclen & 0xf000) | 1001 LRH2PBC(lrhlen); 1002 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 1003 } 1004 ret = check_header_template(req, &tx->hdr, 1005 lrhlen, datalen); 1006 if (ret) 1007 goto free_tx; 1008 ret = sdma_txinit_ahg(&tx->txreq, 1009 SDMA_TXREQ_F_AHG_COPY, 1010 sizeof(tx->hdr) + datalen, 1011 req->ahg_idx, 0, NULL, 0, 1012 user_sdma_txreq_cb); 1013 if (ret) 1014 goto free_tx; 1015 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, 1016 &tx->hdr, 1017 sizeof(tx->hdr)); 1018 if (ret) 1019 goto free_txreq; 1020 } else { 1021 int changes; 1022 1023 changes = set_txreq_header_ahg(req, tx, 1024 datalen); 1025 if (changes < 0) 1026 goto free_tx; 1027 sdma_txinit_ahg(&tx->txreq, 1028 SDMA_TXREQ_F_USE_AHG, 1029 datalen, req->ahg_idx, changes, 1030 req->ahg, sizeof(req->hdr), 1031 user_sdma_txreq_cb); 1032 } 1033 } else { 1034 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 1035 datalen, user_sdma_txreq_cb); 1036 if (ret) 1037 goto free_tx; 1038 /* 1039 * Modify the header for this packet. This only needs 1040 * to be done if we are not going to use AHG. Otherwise, 1041 * the HW will do it based on the changes we gave it 1042 * during sdma_txinit_ahg(). 1043 */ 1044 ret = set_txreq_header(req, tx, datalen); 1045 if (ret) 1046 goto free_txreq; 1047 } 1048 1049 /* 1050 * If the request contains any data vectors, add up to 1051 * fragsize bytes to the descriptor. 
1052 */ 1053 while (queued < datalen && 1054 (req->sent + data_sent) < req->data_len) { 1055 unsigned long base, offset; 1056 unsigned pageidx, len; 1057 1058 base = (unsigned long)iovec->iov.iov_base; 1059 offset = offset_in_page(base + iovec->offset + 1060 iov_offset); 1061 pageidx = (((iovec->offset + iov_offset + 1062 base) - (base & PAGE_MASK)) >> PAGE_SHIFT); 1063 len = offset + req->info.fragsize > PAGE_SIZE ? 1064 PAGE_SIZE - offset : req->info.fragsize; 1065 len = min((datalen - queued), len); 1066 ret = sdma_txadd_page(pq->dd, &tx->txreq, 1067 iovec->pages[pageidx], 1068 offset, len); 1069 if (ret) { 1070 SDMA_DBG(req, "SDMA txreq add page failed %d\n", 1071 ret); 1072 goto free_txreq; 1073 } 1074 iov_offset += len; 1075 queued += len; 1076 data_sent += len; 1077 if (unlikely(queued < datalen && 1078 pageidx == iovec->npages && 1079 req->iov_idx < req->data_iovs - 1)) { 1080 iovec->offset += iov_offset; 1081 iovec = &req->iovs[++req->iov_idx]; 1082 iov_offset = 0; 1083 } 1084 } 1085 /* 1086 * The txreq was submitted successfully so we can update 1087 * the counters. 1088 */ 1089 req->koffset += datalen; 1090 if (req_opcode(req->info.ctrl) == EXPECTED) 1091 req->tidoffset += datalen; 1092 req->sent += data_sent; 1093 if (req->data_len) 1094 iovec->offset += iov_offset; 1095 list_add_tail(&tx->txreq.list, &req->txps); 1096 /* 1097 * It is important to increment this here as it is used to 1098 * generate the BTH.PSN and, therefore, can't be bulk-updated 1099 * outside of the loop. 1100 */ 1101 tx->seqnum = req->seqnum++; 1102 npkts++; 1103 } 1104 dosend: 1105 ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); 1106 req->seqsubmitted += count; 1107 if (req->seqsubmitted == req->info.npkts) { 1108 set_bit(SDMA_REQ_SEND_DONE, &req->flags); 1109 /* 1110 * The txreq has already been submitted to the HW queue 1111 * so we can free the AHG entry now. 
Corruption will not 1112 * happen due to the sequential manner in which 1113 * descriptors are processed. 1114 */ 1115 if (req->ahg_idx >= 0) 1116 sdma_ahg_free(req->sde, req->ahg_idx); 1117 } 1118 return ret; 1119 1120 free_txreq: 1121 sdma_txclean(pq->dd, &tx->txreq); 1122 free_tx: 1123 kmem_cache_free(pq->txreq_cache, tx); 1124 return ret; 1125 } 1126 1127 /* 1128 * How many pages in this iovec element? 1129 */ 1130 static inline int num_user_pages(const struct iovec *iov) 1131 { 1132 const unsigned long addr = (unsigned long)iov->iov_base; 1133 const unsigned long len = iov->iov_len; 1134 const unsigned long spage = addr & PAGE_MASK; 1135 const unsigned long epage = (addr + len - 1) & PAGE_MASK; 1136 1137 return 1 + ((epage - spage) >> PAGE_SHIFT); 1138 } 1139 1140 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 1141 { 1142 struct evict_data evict_data; 1143 1144 evict_data.cleared = 0; 1145 evict_data.target = npages; 1146 hfi1_mmu_rb_evict(pq->handler, &evict_data); 1147 return evict_data.cleared; 1148 } 1149 1150 static int pin_vector_pages(struct user_sdma_request *req, 1151 struct user_sdma_iovec *iovec) 1152 { 1153 int ret = 0, pinned, npages, cleared; 1154 struct page **pages; 1155 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1156 struct sdma_mmu_node *node = NULL; 1157 struct mmu_rb_node *rb_node; 1158 1159 rb_node = hfi1_mmu_rb_extract(pq->handler, 1160 (unsigned long)iovec->iov.iov_base, 1161 iovec->iov.iov_len); 1162 if (rb_node) 1163 node = container_of(rb_node, struct sdma_mmu_node, rb); 1164 else 1165 rb_node = NULL; 1166 1167 if (!node) { 1168 node = kzalloc(sizeof(*node), GFP_KERNEL); 1169 if (!node) 1170 return -ENOMEM; 1171 1172 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1173 node->pq = pq; 1174 atomic_set(&node->refcount, 0); 1175 } 1176 1177 npages = num_user_pages(&iovec->iov); 1178 if (node->npages < npages) { 1179 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 1180 if (!pages) { 1181 SDMA_DBG(req, 
"Failed page array alloc"); 1182 ret = -ENOMEM; 1183 goto bail; 1184 } 1185 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 1186 1187 npages -= node->npages; 1188 1189 retry: 1190 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 1191 atomic_read(&pq->n_locked), npages)) { 1192 cleared = sdma_cache_evict(pq, npages); 1193 if (cleared >= npages) 1194 goto retry; 1195 } 1196 pinned = hfi1_acquire_user_pages(pq->mm, 1197 ((unsigned long)iovec->iov.iov_base + 1198 (node->npages * PAGE_SIZE)), npages, 0, 1199 pages + node->npages); 1200 if (pinned < 0) { 1201 kfree(pages); 1202 ret = pinned; 1203 goto bail; 1204 } 1205 if (pinned != npages) { 1206 unpin_vector_pages(pq->mm, pages, node->npages, 1207 pinned); 1208 ret = -EFAULT; 1209 goto bail; 1210 } 1211 kfree(node->pages); 1212 node->rb.len = iovec->iov.iov_len; 1213 node->pages = pages; 1214 node->npages += pinned; 1215 npages = node->npages; 1216 atomic_add(pinned, &pq->n_locked); 1217 } 1218 iovec->pages = node->pages; 1219 iovec->npages = npages; 1220 iovec->node = node; 1221 1222 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1223 if (ret) { 1224 atomic_sub(node->npages, &pq->n_locked); 1225 iovec->node = NULL; 1226 goto bail; 1227 } 1228 return 0; 1229 bail: 1230 if (rb_node) 1231 unpin_vector_pages(pq->mm, node->pages, 0, node->npages); 1232 kfree(node); 1233 return ret; 1234 } 1235 1236 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1237 unsigned start, unsigned npages) 1238 { 1239 hfi1_release_user_pages(mm, pages + start, npages, false); 1240 kfree(pages); 1241 } 1242 1243 static int check_header_template(struct user_sdma_request *req, 1244 struct hfi1_pkt_header *hdr, u32 lrhlen, 1245 u32 datalen) 1246 { 1247 /* 1248 * Perform safety checks for any type of packet: 1249 * - transfer size is multiple of 64bytes 1250 * - packet length is multiple of 4 bytes 1251 * - packet length is not larger than MTU size 1252 * 1253 * These checks are only done for the first packet of the 
1254 * transfer since the header is "given" to us by user space. 1255 * For the remainder of the packets we compute the values. 1256 */ 1257 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1258 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1259 return -EINVAL; 1260 1261 if (req_opcode(req->info.ctrl) == EXPECTED) { 1262 /* 1263 * The header is checked only on the first packet. Furthermore, 1264 * we ensure that at least one TID entry is copied when the 1265 * request is submitted. Therefore, we don't have to verify that 1266 * tididx points to something sane. 1267 */ 1268 u32 tidval = req->tids[req->tididx], 1269 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1270 tididx = EXP_TID_GET(tidval, IDX), 1271 tidctrl = EXP_TID_GET(tidval, CTRL), 1272 tidoff; 1273 __le32 kval = hdr->kdeth.ver_tid_offset; 1274 1275 tidoff = KDETH_GET(kval, OFFSET) * 1276 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1277 KDETH_OM_LARGE : KDETH_OM_SMALL); 1278 /* 1279 * Expected receive packets have the following 1280 * additional checks: 1281 * - offset is not larger than the TID size 1282 * - TIDCtrl values match between header and TID array 1283 * - TID indexes match between header and TID array 1284 */ 1285 if ((tidoff + datalen > tidlen) || 1286 KDETH_GET(kval, TIDCTRL) != tidctrl || 1287 KDETH_GET(kval, TID) != tididx) 1288 return -EINVAL; 1289 } 1290 return 0; 1291 } 1292 1293 /* 1294 * Correctly set the BTH.PSN field based on type of 1295 * transfer - eager packets can just increment the PSN but 1296 * expected packets encode generation and sequence in the 1297 * BTH.PSN field so just incrementing will result in errors. 1298 */ 1299 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1300 { 1301 u32 val = be32_to_cpu(bthpsn), 1302 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 
0x7fffffffull : 1303 0xffffffull), 1304 psn = val & mask; 1305 if (expct) 1306 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); 1307 else 1308 psn = psn + frags; 1309 return psn & mask; 1310 } 1311 1312 static int set_txreq_header(struct user_sdma_request *req, 1313 struct user_sdma_txreq *tx, u32 datalen) 1314 { 1315 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1316 struct hfi1_pkt_header *hdr = &tx->hdr; 1317 u8 omfactor; /* KDETH.OM */ 1318 u16 pbclen; 1319 int ret; 1320 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1321 1322 /* Copy the header template to the request before modification */ 1323 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1324 1325 /* 1326 * Check if the PBC and LRH length are mismatched. If so 1327 * adjust both in the header. 1328 */ 1329 pbclen = le16_to_cpu(hdr->pbc[0]); 1330 if (PBC2LRH(pbclen) != lrhlen) { 1331 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1332 hdr->pbc[0] = cpu_to_le16(pbclen); 1333 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1334 /* 1335 * Third packet 1336 * This is the first packet in the sequence that has 1337 * a "static" size that can be used for the rest of 1338 * the packets (besides the last one). 1339 */ 1340 if (unlikely(req->seqnum == 2)) { 1341 /* 1342 * From this point on the lengths in both the 1343 * PBC and LRH are the same until the last 1344 * packet. 1345 * Adjust the template so we don't have to update 1346 * every packet 1347 */ 1348 req->hdr.pbc[0] = hdr->pbc[0]; 1349 req->hdr.lrh[2] = hdr->lrh[2]; 1350 } 1351 } 1352 /* 1353 * We only have to modify the header if this is not the 1354 * first packet in the request. Otherwise, we use the 1355 * header given to us. 
1356 */ 1357 if (unlikely(!req->seqnum)) { 1358 ret = check_header_template(req, hdr, lrhlen, datalen); 1359 if (ret) 1360 return ret; 1361 goto done; 1362 } 1363 1364 hdr->bth[2] = cpu_to_be32( 1365 set_pkt_bth_psn(hdr->bth[2], 1366 (req_opcode(req->info.ctrl) == EXPECTED), 1367 req->seqnum)); 1368 1369 /* Set ACK request on last packet */ 1370 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1371 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1372 1373 /* Set the new offset */ 1374 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1375 /* Expected packets have to fill in the new TID information */ 1376 if (req_opcode(req->info.ctrl) == EXPECTED) { 1377 tidval = req->tids[req->tididx]; 1378 /* 1379 * If the offset puts us at the end of the current TID, 1380 * advance everything. 1381 */ 1382 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1383 PAGE_SIZE)) { 1384 req->tidoffset = 0; 1385 /* 1386 * Since we don't copy all the TIDs, all at once, 1387 * we have to check again. 1388 */ 1389 if (++req->tididx > req->n_tids - 1 || 1390 !req->tids[req->tididx]) { 1391 return -EINVAL; 1392 } 1393 tidval = req->tids[req->tididx]; 1394 } 1395 omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1396 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE_SHIFT : 1397 KDETH_OM_SMALL_SHIFT; 1398 /* Set KDETH.TIDCtrl based on value for this TID. */ 1399 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1400 EXP_TID_GET(tidval, CTRL)); 1401 /* Set KDETH.TID based on value for this TID */ 1402 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1403 EXP_TID_GET(tidval, IDX)); 1404 /* Clear KDETH.SH when DISABLE_SH flag is set */ 1405 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) 1406 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1407 /* 1408 * Set the KDETH.OFFSET and KDETH.OM based on size of 1409 * transfer. 
1410 */ 1411 SDMA_DBG(req, "TID offset %ubytes %uunits om%u", 1412 req->tidoffset, req->tidoffset >> omfactor, 1413 omfactor != KDETH_OM_SMALL_SHIFT); 1414 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1415 req->tidoffset >> omfactor); 1416 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1417 omfactor != KDETH_OM_SMALL_SHIFT); 1418 } 1419 done: 1420 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1421 req->info.comp_idx, hdr, tidval); 1422 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1423 } 1424 1425 static int set_txreq_header_ahg(struct user_sdma_request *req, 1426 struct user_sdma_txreq *tx, u32 len) 1427 { 1428 int diff = 0; 1429 u8 omfactor; /* KDETH.OM */ 1430 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1431 struct hfi1_pkt_header *hdr = &req->hdr; 1432 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1433 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); 1434 1435 if (PBC2LRH(pbclen) != lrhlen) { 1436 /* PBC.PbcLengthDWs */ 1437 AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, 1438 cpu_to_le16(LRH2PBC(lrhlen))); 1439 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1440 AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, 1441 cpu_to_be16(lrhlen >> 2)); 1442 } 1443 1444 /* 1445 * Do the common updates 1446 */ 1447 /* BTH.PSN and BTH.A */ 1448 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1449 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 
0x7fffffff : 0xffffff); 1450 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_ACK)) 1451 val32 |= 1UL << 31; 1452 AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); 1453 AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); 1454 /* KDETH.Offset */ 1455 AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, 1456 cpu_to_le16(req->koffset & 0xffff)); 1457 AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, 1458 cpu_to_le16(req->koffset >> 16)); 1459 if (req_opcode(req->info.ctrl) == EXPECTED) { 1460 __le16 val; 1461 1462 tidval = req->tids[req->tididx]; 1463 1464 /* 1465 * If the offset puts us at the end of the current TID, 1466 * advance everything. 1467 */ 1468 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1469 PAGE_SIZE)) { 1470 req->tidoffset = 0; 1471 /* 1472 * Since we don't copy all the TIDs, all at once, 1473 * we have to check again. 1474 */ 1475 if (++req->tididx > req->n_tids - 1 || 1476 !req->tids[req->tididx]) { 1477 return -EINVAL; 1478 } 1479 tidval = req->tids[req->tididx]; 1480 } 1481 omfactor = ((EXP_TID_GET(tidval, LEN) * 1482 PAGE_SIZE) >= 1483 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE_SHIFT : 1484 KDETH_OM_SMALL_SHIFT; 1485 /* KDETH.OM and KDETH.OFFSET (TID) */ 1486 AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, 1487 ((!!(omfactor - KDETH_OM_SMALL_SHIFT)) << 15 | 1488 ((req->tidoffset >> omfactor) 1489 & 0x7fff))); 1490 /* KDETH.TIDCtrl, KDETH.TID, KDETH.Intr, KDETH.SH */ 1491 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1492 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1493 1494 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_DISABLE_SH)) { 1495 val |= cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1496 INTR) << 1497 AHG_KDETH_INTR_SHIFT)); 1498 } else { 1499 val |= KDETH_GET(hdr->kdeth.ver_tid_offset, SH) ? 
1500 cpu_to_le16(0x1 << AHG_KDETH_SH_SHIFT) : 1501 cpu_to_le16((KDETH_GET(hdr->kdeth.ver_tid_offset, 1502 INTR) << 1503 AHG_KDETH_INTR_SHIFT)); 1504 } 1505 1506 AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); 1507 } 1508 1509 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1510 req->info.comp_idx, req->sde->this_idx, 1511 req->ahg_idx, req->ahg, diff, tidval); 1512 return diff; 1513 } 1514 1515 /* 1516 * SDMA tx request completion callback. Called when the SDMA progress 1517 * state machine gets notification that the SDMA descriptors for this 1518 * tx request have been processed by the DMA engine. Called in 1519 * interrupt context. 1520 */ 1521 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1522 { 1523 struct user_sdma_txreq *tx = 1524 container_of(txreq, struct user_sdma_txreq, txreq); 1525 struct user_sdma_request *req; 1526 struct hfi1_user_sdma_pkt_q *pq; 1527 struct hfi1_user_sdma_comp_q *cq; 1528 u16 idx; 1529 1530 if (!tx->req) 1531 return; 1532 1533 req = tx->req; 1534 pq = req->pq; 1535 cq = req->cq; 1536 1537 if (status != SDMA_TXREQ_S_OK) { 1538 SDMA_DBG(req, "SDMA completion with error %d", 1539 status); 1540 set_bit(SDMA_REQ_HAS_ERROR, &req->flags); 1541 } 1542 1543 req->seqcomp = tx->seqnum; 1544 kmem_cache_free(pq->txreq_cache, tx); 1545 tx = NULL; 1546 1547 idx = req->info.comp_idx; 1548 if (req->status == -1 && status == SDMA_TXREQ_S_OK) { 1549 if (req->seqcomp == req->info.npkts - 1) { 1550 req->status = 0; 1551 user_sdma_free_request(req, false); 1552 pq_update(pq); 1553 set_comp_state(pq, cq, idx, COMPLETE, 0); 1554 } 1555 } else { 1556 if (status != SDMA_TXREQ_S_OK) 1557 req->status = status; 1558 if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && 1559 (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || 1560 test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { 1561 user_sdma_free_request(req, false); 1562 pq_update(pq); 1563 set_comp_state(pq, cq, idx, ERROR, req->status); 1564 } 1565 } 1566 } 1567 1568 
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1569 { 1570 if (atomic_dec_and_test(&pq->n_reqs)) { 1571 xchg(&pq->state, SDMA_PKT_Q_INACTIVE); 1572 wake_up(&pq->wait); 1573 } 1574 } 1575 1576 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1577 { 1578 if (!list_empty(&req->txps)) { 1579 struct sdma_txreq *t, *p; 1580 1581 list_for_each_entry_safe(t, p, &req->txps, list) { 1582 struct user_sdma_txreq *tx = 1583 container_of(t, struct user_sdma_txreq, txreq); 1584 list_del_init(&t->list); 1585 sdma_txclean(req->pq->dd, t); 1586 kmem_cache_free(req->pq->txreq_cache, tx); 1587 } 1588 } 1589 if (req->data_iovs) { 1590 struct sdma_mmu_node *node; 1591 int i; 1592 1593 for (i = 0; i < req->data_iovs; i++) { 1594 node = req->iovs[i].node; 1595 if (!node) 1596 continue; 1597 1598 if (unpin) 1599 hfi1_mmu_rb_remove(req->pq->handler, 1600 &node->rb); 1601 else 1602 atomic_dec(&node->refcount); 1603 } 1604 } 1605 kfree(req->tids); 1606 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1607 } 1608 1609 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1610 struct hfi1_user_sdma_comp_q *cq, 1611 u16 idx, enum hfi1_sdma_comp_state state, 1612 int ret) 1613 { 1614 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", 1615 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); 1616 if (state == ERROR) 1617 cq->comps[idx].errcode = -ret; 1618 smp_wmb(); /* make sure errcode is visible first */ 1619 cq->comps[idx].status = state; 1620 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1621 idx, state, ret); 1622 } 1623 1624 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1625 unsigned long len) 1626 { 1627 return (bool)(node->addr == addr); 1628 } 1629 1630 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1631 { 1632 struct sdma_mmu_node *node = 1633 container_of(mnode, struct sdma_mmu_node, rb); 1634 1635 atomic_inc(&node->refcount); 1636 return 0; 1637 } 
1638 1639 /* 1640 * Return 1 to remove the node from the rb tree and call the remove op. 1641 * 1642 * Called with the rb tree lock held. 1643 */ 1644 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1645 void *evict_arg, bool *stop) 1646 { 1647 struct sdma_mmu_node *node = 1648 container_of(mnode, struct sdma_mmu_node, rb); 1649 struct evict_data *evict_data = evict_arg; 1650 1651 /* is this node still being used? */ 1652 if (atomic_read(&node->refcount)) 1653 return 0; /* keep this node */ 1654 1655 /* this node will be evicted, add its pages to our count */ 1656 evict_data->cleared += node->npages; 1657 1658 /* have enough pages been cleared? */ 1659 if (evict_data->cleared >= evict_data->target) 1660 *stop = true; 1661 1662 return 1; /* remove this node */ 1663 } 1664 1665 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1666 { 1667 struct sdma_mmu_node *node = 1668 container_of(mnode, struct sdma_mmu_node, rb); 1669 1670 atomic_sub(node->npages, &node->pq->n_locked); 1671 1672 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 1673 1674 kfree(node); 1675 } 1676 1677 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1678 { 1679 struct sdma_mmu_node *node = 1680 container_of(mnode, struct sdma_mmu_node, rb); 1681 1682 if (!atomic_read(&node->refcount)) 1683 return 1; 1684 return 0; 1685 } 1686