1 /* 2 * Copyright(c) 2015, 2016 Intel Corporation. 3 * 4 * This file is provided under a dual BSD/GPLv2 license. When using or 5 * redistributing this file, you may do so under either license. 6 * 7 * GPL LICENSE SUMMARY 8 * 9 * This program is free software; you can redistribute it and/or modify 10 * it under the terms of version 2 of the GNU General Public License as 11 * published by the Free Software Foundation. 12 * 13 * This program is distributed in the hope that it will be useful, but 14 * WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * General Public License for more details. 17 * 18 * BSD LICENSE 19 * 20 * Redistribution and use in source and binary forms, with or without 21 * modification, are permitted provided that the following conditions 22 * are met: 23 * 24 * - Redistributions of source code must retain the above copyright 25 * notice, this list of conditions and the following disclaimer. 26 * - Redistributions in binary form must reproduce the above copyright 27 * notice, this list of conditions and the following disclaimer in 28 * the documentation and/or other materials provided with the 29 * distribution. 30 * - Neither the name of Intel Corporation nor the names of its 31 * contributors may be used to endorse or promote products derived 32 * from this software without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 35 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 36 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 37 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 38 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 39 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 40 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 41 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 42 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 43 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 44 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 45 * 46 */ 47 #include <linux/mm.h> 48 #include <linux/types.h> 49 #include <linux/device.h> 50 #include <linux/dmapool.h> 51 #include <linux/slab.h> 52 #include <linux/list.h> 53 #include <linux/highmem.h> 54 #include <linux/io.h> 55 #include <linux/uio.h> 56 #include <linux/rbtree.h> 57 #include <linux/spinlock.h> 58 #include <linux/delay.h> 59 #include <linux/kthread.h> 60 #include <linux/mmu_context.h> 61 #include <linux/module.h> 62 #include <linux/vmalloc.h> 63 64 #include "hfi.h" 65 #include "sdma.h" 66 #include "user_sdma.h" 67 #include "verbs.h" /* for the headers */ 68 #include "common.h" /* for struct hfi1_tid_info */ 69 #include "trace.h" 70 #include "mmu_rb.h" 71 72 static uint hfi1_sdma_comp_ring_size = 128; 73 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); 74 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); 75 76 /* The maximum number of Data io vectors per message/request */ 77 #define MAX_VECTORS_PER_REQ 8 78 /* 79 * Maximum number of packet to send from each message/request 80 * before moving to the next one. 81 */ 82 #define MAX_PKTS_PER_QUEUE 16 83 84 #define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) 85 86 #define req_opcode(x) \ 87 (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 88 #define req_version(x) \ 89 (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) 90 #define req_iovcnt(x) \ 91 (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) 92 93 /* Number of BTH.PSN bits used for sequence number in expected rcvs */ 94 #define BTH_SEQ_MASK 0x7ffull 95 96 /* 97 * Define fields in the KDETH header so we can update the header 98 * template. 99 */ 100 #define KDETH_OFFSET_SHIFT 0 101 #define KDETH_OFFSET_MASK 0x7fff 102 #define KDETH_OM_SHIFT 15 103 #define KDETH_OM_MASK 0x1 104 #define KDETH_TID_SHIFT 16 105 #define KDETH_TID_MASK 0x3ff 106 #define KDETH_TIDCTRL_SHIFT 26 107 #define KDETH_TIDCTRL_MASK 0x3 108 #define KDETH_INTR_SHIFT 28 109 #define KDETH_INTR_MASK 0x1 110 #define KDETH_SH_SHIFT 29 111 #define KDETH_SH_MASK 0x1 112 #define KDETH_HCRC_UPPER_SHIFT 16 113 #define KDETH_HCRC_UPPER_MASK 0xff 114 #define KDETH_HCRC_LOWER_SHIFT 24 115 #define KDETH_HCRC_LOWER_MASK 0xff 116 117 #define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) 118 #define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) 119 120 #define KDETH_GET(val, field) \ 121 (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) 122 #define KDETH_SET(dw, field, val) do { \ 123 u32 dwval = le32_to_cpu(dw); \ 124 dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ 125 dwval |= (((val) & KDETH_##field##_MASK) << \ 126 KDETH_##field##_SHIFT); \ 127 dw = cpu_to_le32(dwval); \ 128 } while (0) 129 130 #define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ 131 do { \ 132 if ((idx) < ARRAY_SIZE((arr))) \ 133 (arr)[(idx++)] = sdma_build_ahg_descriptor( \ 134 (__force u16)(value), (dw), (bit), \ 135 (width)); \ 136 else \ 137 return -ERANGE; \ 138 } while (0) 139 140 /* KDETH OM multipliers and switch over point */ 141 #define KDETH_OM_SMALL 4 142 #define KDETH_OM_LARGE 64 143 #define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) 144 145 /* Last packet in the request */ 146 #define TXREQ_FLAGS_REQ_LAST_PKT BIT(0) 147 148 /* SDMA request flag bits */ 149 #define SDMA_REQ_FOR_THREAD 1 150 #define SDMA_REQ_SEND_DONE 2 151 #define SDMA_REQ_HAVE_AHG 3 152 #define SDMA_REQ_HAS_ERROR 4 153 #define SDMA_REQ_DONE_ERROR 5 154 155 #define SDMA_PKT_Q_INACTIVE BIT(0) 156 #define SDMA_PKT_Q_ACTIVE BIT(1) 157 #define SDMA_PKT_Q_DEFERRED BIT(2) 158 159 /* 160 * Maximum retry attempts to submit a TX request 161 * before putting the process to sleep. 162 */ 163 #define MAX_DEFER_RETRY_COUNT 1 164 165 static unsigned initial_pkt_count = 8; 166 167 #define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ 168 169 struct sdma_mmu_node; 170 171 struct user_sdma_iovec { 172 struct list_head list; 173 struct iovec iov; 174 /* number of pages in this vector */ 175 unsigned npages; 176 /* array of pinned pages for this vector */ 177 struct page **pages; 178 /* 179 * offset into the virtual address space of the vector at 180 * which we last left off. 181 */ 182 u64 offset; 183 struct sdma_mmu_node *node; 184 }; 185 186 struct sdma_mmu_node { 187 struct mmu_rb_node rb; 188 struct hfi1_user_sdma_pkt_q *pq; 189 atomic_t refcount; 190 struct page **pages; 191 unsigned npages; 192 }; 193 194 /* evict operation argument */ 195 struct evict_data { 196 u32 cleared; /* count evicted so far */ 197 u32 target; /* target count to evict */ 198 }; 199 200 struct user_sdma_request { 201 struct sdma_req_info info; 202 struct hfi1_user_sdma_pkt_q *pq; 203 struct hfi1_user_sdma_comp_q *cq; 204 /* This is the original header from user space */ 205 struct hfi1_pkt_header hdr; 206 /* 207 * Pointer to the SDMA engine for this request. 208 * Since different request could be on different VLs, 209 * each request will need it's own engine pointer. 210 */ 211 struct sdma_engine *sde; 212 u8 ahg_idx; 213 u32 ahg[9]; 214 /* 215 * KDETH.Offset (Eager) field 216 * We need to remember the initial value so the headers 217 * can be updated properly. 218 */ 219 u32 koffset; 220 /* 221 * KDETH.OFFSET (TID) field 222 * The offset can cover multiple packets, depending on the 223 * size of the TID entry. 224 */ 225 u32 tidoffset; 226 /* 227 * KDETH.OM 228 * Remember this because the header template always sets it 229 * to 0. 230 */ 231 u8 omfactor; 232 /* 233 * We copy the iovs for this request (based on 234 * info.iovcnt). These are only the data vectors 235 */ 236 unsigned data_iovs; 237 /* total length of the data in the request */ 238 u32 data_len; 239 /* progress index moving along the iovs array */ 240 unsigned iov_idx; 241 struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; 242 /* number of elements copied to the tids array */ 243 u16 n_tids; 244 /* TID array values copied from the tid_iov vector */ 245 u32 *tids; 246 u16 tididx; 247 u32 sent; 248 u64 seqnum; 249 u64 seqcomp; 250 u64 seqsubmitted; 251 struct list_head txps; 252 unsigned long flags; 253 /* status of the last txreq completed */ 254 int status; 255 }; 256 257 /* 258 * A single txreq could span up to 3 physical pages when the MTU 259 * is sufficiently large (> 4K). Each of the IOV pointers also 260 * needs it's own set of flags so the vector has been handled 261 * independently of each other. 262 */ 263 struct user_sdma_txreq { 264 /* Packet header for the txreq */ 265 struct hfi1_pkt_header hdr; 266 struct sdma_txreq txreq; 267 struct list_head list; 268 struct user_sdma_request *req; 269 u16 flags; 270 unsigned busycount; 271 u64 seqnum; 272 }; 273 274 #define SDMA_DBG(req, fmt, ...) \ 275 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ 276 (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ 277 ##__VA_ARGS__) 278 #define SDMA_Q_DBG(pq, fmt, ...) \ 279 hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ 280 (pq)->subctxt, ##__VA_ARGS__) 281 282 static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); 283 static int num_user_pages(const struct iovec *); 284 static void user_sdma_txreq_cb(struct sdma_txreq *, int); 285 static inline void pq_update(struct hfi1_user_sdma_pkt_q *); 286 static void user_sdma_free_request(struct user_sdma_request *, bool); 287 static int pin_vector_pages(struct user_sdma_request *, 288 struct user_sdma_iovec *); 289 static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned, 290 unsigned); 291 static int check_header_template(struct user_sdma_request *, 292 struct hfi1_pkt_header *, u32, u32); 293 static int set_txreq_header(struct user_sdma_request *, 294 struct user_sdma_txreq *, u32); 295 static int set_txreq_header_ahg(struct user_sdma_request *, 296 struct user_sdma_txreq *, u32); 297 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *, 298 struct hfi1_user_sdma_comp_q *, 299 u16, enum hfi1_sdma_comp_state, int); 300 static inline u32 set_pkt_bth_psn(__be32, u8, u32); 301 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); 302 303 static int defer_packet_queue( 304 struct sdma_engine *, 305 struct iowait *, 306 struct sdma_txreq *, 307 unsigned seq); 308 static void activate_packet_queue(struct iowait *, int); 309 static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); 310 static int sdma_rb_insert(void *, struct mmu_rb_node *); 311 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 312 void *arg2, bool *stop); 313 static void sdma_rb_remove(void *, struct mmu_rb_node *); 314 static int sdma_rb_invalidate(void *, struct mmu_rb_node *); 315 316 static struct mmu_rb_ops sdma_rb_ops = { 317 .filter = sdma_rb_filter, 318 .insert = sdma_rb_insert, 319 .evict = sdma_rb_evict, 320 .remove = sdma_rb_remove, 321 .invalidate = sdma_rb_invalidate 322 }; 323 324 static int defer_packet_queue( 325 struct sdma_engine *sde, 326 struct iowait *wait, 327 struct sdma_txreq *txreq, 328 unsigned seq) 329 { 330 struct hfi1_user_sdma_pkt_q *pq = 331 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 332 struct hfi1_ibdev *dev = &pq->dd->verbs_dev; 333 struct user_sdma_txreq *tx = 334 container_of(txreq, struct user_sdma_txreq, txreq); 335 336 if (sdma_progress(sde, seq, txreq)) { 337 if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) 338 goto eagain; 339 } 340 /* 341 * We are assuming that if the list is enqueued somewhere, it 342 * is to the dmawait list since that is the only place where 343 * it is supposed to be enqueued. 344 */ 345 xchg(&pq->state, SDMA_PKT_Q_DEFERRED); 346 write_seqlock(&dev->iowait_lock); 347 if (list_empty(&pq->busy.list)) 348 list_add_tail(&pq->busy.list, &sde->dmawait); 349 write_sequnlock(&dev->iowait_lock); 350 return -EBUSY; 351 eagain: 352 return -EAGAIN; 353 } 354 355 static void activate_packet_queue(struct iowait *wait, int reason) 356 { 357 struct hfi1_user_sdma_pkt_q *pq = 358 container_of(wait, struct hfi1_user_sdma_pkt_q, busy); 359 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 360 wake_up(&wait->wait_dma); 361 }; 362 363 static void sdma_kmem_cache_ctor(void *obj) 364 { 365 struct user_sdma_txreq *tx = obj; 366 367 memset(tx, 0, sizeof(*tx)); 368 } 369 370 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) 371 { 372 struct hfi1_filedata *fd; 373 int ret = 0; 374 unsigned memsize; 375 char buf[64]; 376 struct hfi1_devdata *dd; 377 struct hfi1_user_sdma_comp_q *cq; 378 struct hfi1_user_sdma_pkt_q *pq; 379 unsigned long flags; 380 381 if (!uctxt || !fp) { 382 ret = -EBADF; 383 goto done; 384 } 385 386 fd = fp->private_data; 387 388 if (!hfi1_sdma_comp_ring_size) { 389 ret = -EINVAL; 390 goto done; 391 } 392 393 dd = uctxt->dd; 394 395 pq = kzalloc(sizeof(*pq), GFP_KERNEL); 396 if (!pq) 397 goto pq_nomem; 398 399 memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size; 400 pq->reqs = kzalloc(memsize, GFP_KERNEL); 401 if (!pq->reqs) 402 goto pq_reqs_nomem; 403 404 memsize = BITS_TO_LONGS(hfi1_sdma_comp_ring_size) * sizeof(long); 405 pq->req_in_use = kzalloc(memsize, GFP_KERNEL); 406 if (!pq->req_in_use) 407 goto pq_reqs_no_in_use; 408 409 INIT_LIST_HEAD(&pq->list); 410 pq->dd = dd; 411 pq->ctxt = uctxt->ctxt; 412 pq->subctxt = fd->subctxt; 413 pq->n_max_reqs = hfi1_sdma_comp_ring_size; 414 pq->state = SDMA_PKT_Q_INACTIVE; 415 atomic_set(&pq->n_reqs, 0); 416 init_waitqueue_head(&pq->wait); 417 atomic_set(&pq->n_locked, 0); 418 pq->mm = fd->mm; 419 420 iowait_init(&pq->busy, 0, NULL, defer_packet_queue, 421 activate_packet_queue, NULL); 422 pq->reqidx = 0; 423 snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, 424 fd->subctxt); 425 pq->txreq_cache = kmem_cache_create(buf, 426 sizeof(struct user_sdma_txreq), 427 L1_CACHE_BYTES, 428 SLAB_HWCACHE_ALIGN, 429 sdma_kmem_cache_ctor); 430 if (!pq->txreq_cache) { 431 dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", 432 uctxt->ctxt); 433 goto pq_txreq_nomem; 434 } 435 fd->pq = pq; 436 cq = kzalloc(sizeof(*cq), GFP_KERNEL); 437 if (!cq) 438 goto cq_nomem; 439 440 memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size); 441 cq->comps = vmalloc_user(memsize); 442 if (!cq->comps) 443 goto cq_comps_nomem; 444 445 cq->nentries = hfi1_sdma_comp_ring_size; 446 fd->cq = cq; 447 448 ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, 449 &pq->handler); 450 if (ret) { 451 dd_dev_err(dd, "Failed to register with MMU %d", ret); 452 goto done; 453 } 454 455 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 456 list_add(&pq->list, &uctxt->sdma_queues); 457 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 458 goto done; 459 460 cq_comps_nomem: 461 kfree(cq); 462 cq_nomem: 463 kmem_cache_destroy(pq->txreq_cache); 464 pq_txreq_nomem: 465 kfree(pq->req_in_use); 466 pq_reqs_no_in_use: 467 kfree(pq->reqs); 468 pq_reqs_nomem: 469 kfree(pq); 470 fd->pq = NULL; 471 pq_nomem: 472 ret = -ENOMEM; 473 done: 474 return ret; 475 } 476 477 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) 478 { 479 struct hfi1_ctxtdata *uctxt = fd->uctxt; 480 struct hfi1_user_sdma_pkt_q *pq; 481 unsigned long flags; 482 483 hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, 484 uctxt->ctxt, fd->subctxt); 485 pq = fd->pq; 486 if (pq) { 487 if (pq->handler) 488 hfi1_mmu_rb_unregister(pq->handler); 489 spin_lock_irqsave(&uctxt->sdma_qlock, flags); 490 if (!list_empty(&pq->list)) 491 list_del_init(&pq->list); 492 spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); 493 iowait_sdma_drain(&pq->busy); 494 /* Wait until all requests have been freed. */ 495 wait_event_interruptible( 496 pq->wait, 497 (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); 498 kfree(pq->reqs); 499 kfree(pq->req_in_use); 500 kmem_cache_destroy(pq->txreq_cache); 501 kfree(pq); 502 fd->pq = NULL; 503 } 504 if (fd->cq) { 505 vfree(fd->cq->comps); 506 kfree(fd->cq); 507 fd->cq = NULL; 508 } 509 return 0; 510 } 511 512 static u8 dlid_to_selector(u16 dlid) 513 { 514 static u8 mapping[256]; 515 static int initialized; 516 static u8 next; 517 int hash; 518 519 if (!initialized) { 520 memset(mapping, 0xFF, 256); 521 initialized = 1; 522 } 523 524 hash = ((dlid >> 8) ^ dlid) & 0xFF; 525 if (mapping[hash] == 0xFF) { 526 mapping[hash] = next; 527 next = (next + 1) & 0x7F; 528 } 529 530 return mapping[hash]; 531 } 532 533 int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, 534 unsigned long dim, unsigned long *count) 535 { 536 int ret = 0, i; 537 struct hfi1_filedata *fd = fp->private_data; 538 struct hfi1_ctxtdata *uctxt = fd->uctxt; 539 struct hfi1_user_sdma_pkt_q *pq = fd->pq; 540 struct hfi1_user_sdma_comp_q *cq = fd->cq; 541 struct hfi1_devdata *dd = pq->dd; 542 unsigned long idx = 0; 543 u8 pcount = initial_pkt_count; 544 struct sdma_req_info info; 545 struct user_sdma_request *req; 546 u8 opcode, sc, vl; 547 int req_queued = 0; 548 u16 dlid; 549 u8 selector; 550 551 if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { 552 hfi1_cdbg( 553 SDMA, 554 "[%u:%u:%u] First vector not big enough for header %lu/%lu", 555 dd->unit, uctxt->ctxt, fd->subctxt, 556 iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); 557 return -EINVAL; 558 } 559 ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); 560 if (ret) { 561 hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", 562 dd->unit, uctxt->ctxt, fd->subctxt, ret); 563 return -EFAULT; 564 } 565 566 trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, 567 (u16 *)&info); 568 569 if (info.comp_idx >= hfi1_sdma_comp_ring_size) { 570 hfi1_cdbg(SDMA, 571 "[%u:%u:%u:%u] Invalid comp index", 572 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 573 return -EINVAL; 574 } 575 576 /* 577 * Sanity check the header io vector count. Need at least 1 vector 578 * (header) and cannot be larger than the actual io vector count. 579 */ 580 if (req_iovcnt(info.ctrl) < 1 || req_iovcnt(info.ctrl) > dim) { 581 hfi1_cdbg(SDMA, 582 "[%u:%u:%u:%u] Invalid iov count %d, dim %ld", 583 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx, 584 req_iovcnt(info.ctrl), dim); 585 return -EINVAL; 586 } 587 588 if (!info.fragsize) { 589 hfi1_cdbg(SDMA, 590 "[%u:%u:%u:%u] Request does not specify fragsize", 591 dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); 592 return -EINVAL; 593 } 594 595 /* Try to claim the request. */ 596 if (test_and_set_bit(info.comp_idx, pq->req_in_use)) { 597 hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in use", 598 dd->unit, uctxt->ctxt, fd->subctxt, 599 info.comp_idx); 600 return -EBADSLT; 601 } 602 /* 603 * All safety checks have been done and this request has been claimed. 604 */ 605 hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, 606 uctxt->ctxt, fd->subctxt, info.comp_idx); 607 req = pq->reqs + info.comp_idx; 608 memset(req, 0, sizeof(*req)); 609 req->data_iovs = req_iovcnt(info.ctrl) - 1; /* subtract header vector */ 610 req->pq = pq; 611 req->cq = cq; 612 req->status = -1; 613 INIT_LIST_HEAD(&req->txps); 614 615 memcpy(&req->info, &info, sizeof(info)); 616 617 if (req_opcode(info.ctrl) == EXPECTED) { 618 /* expected must have a TID info and at least one data vector */ 619 if (req->data_iovs < 2) { 620 SDMA_DBG(req, 621 "Not enough vectors for expected request"); 622 ret = -EINVAL; 623 goto free_req; 624 } 625 req->data_iovs--; 626 } 627 628 if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { 629 SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, 630 MAX_VECTORS_PER_REQ); 631 ret = -EINVAL; 632 goto free_req; 633 } 634 /* Copy the header from the user buffer */ 635 ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), 636 sizeof(req->hdr)); 637 if (ret) { 638 SDMA_DBG(req, "Failed to copy header template (%d)", ret); 639 ret = -EFAULT; 640 goto free_req; 641 } 642 643 /* If Static rate control is not enabled, sanitize the header. */ 644 if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) 645 req->hdr.pbc[2] = 0; 646 647 /* Validate the opcode. Do not trust packets from user space blindly. */ 648 opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; 649 if ((opcode & USER_OPCODE_CHECK_MASK) != 650 USER_OPCODE_CHECK_VAL) { 651 SDMA_DBG(req, "Invalid opcode (%d)", opcode); 652 ret = -EINVAL; 653 goto free_req; 654 } 655 /* 656 * Validate the vl. Do not trust packets from user space blindly. 657 * VL comes from PBC, SC comes from LRH, and the VL needs to 658 * match the SC look up. 659 */ 660 vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; 661 sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | 662 (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); 663 if (vl >= dd->pport->vls_operational || 664 vl != sc_to_vlt(dd, sc)) { 665 SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); 666 ret = -EINVAL; 667 goto free_req; 668 } 669 670 /* Checking P_KEY for requests from user-space */ 671 if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc, 672 PKEY_CHECK_INVALID)) { 673 ret = -EINVAL; 674 goto free_req; 675 } 676 677 /* 678 * Also should check the BTH.lnh. If it says the next header is GRH then 679 * the RXE parsing will be off and will land in the middle of the KDETH 680 * or miss it entirely. 681 */ 682 if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { 683 SDMA_DBG(req, "User tried to pass in a GRH"); 684 ret = -EINVAL; 685 goto free_req; 686 } 687 688 req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); 689 /* 690 * Calculate the initial TID offset based on the values of 691 * KDETH.OFFSET and KDETH.OM that are passed in. 692 */ 693 req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * 694 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 695 KDETH_OM_LARGE : KDETH_OM_SMALL); 696 SDMA_DBG(req, "Initial TID offset %u", req->tidoffset); 697 idx++; 698 699 /* Save all the IO vector structures */ 700 for (i = 0; i < req->data_iovs; i++) { 701 INIT_LIST_HEAD(&req->iovs[i].list); 702 memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); 703 ret = pin_vector_pages(req, &req->iovs[i]); 704 if (ret) { 705 req->status = ret; 706 goto free_req; 707 } 708 req->data_len += req->iovs[i].iov.iov_len; 709 } 710 SDMA_DBG(req, "total data length %u", req->data_len); 711 712 if (pcount > req->info.npkts) 713 pcount = req->info.npkts; 714 /* 715 * Copy any TID info 716 * User space will provide the TID info only when the 717 * request type is EXPECTED. This is true even if there is 718 * only one packet in the request and the header is already 719 * setup. The reason for the singular TID case is that the 720 * driver needs to perform safety checks. 721 */ 722 if (req_opcode(req->info.ctrl) == EXPECTED) { 723 u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); 724 725 if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { 726 ret = -EINVAL; 727 goto free_req; 728 } 729 req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL); 730 if (!req->tids) { 731 ret = -ENOMEM; 732 goto free_req; 733 } 734 /* 735 * We have to copy all of the tids because they may vary 736 * in size and, therefore, the TID count might not be 737 * equal to the pkt count. However, there is no way to 738 * tell at this point. 739 */ 740 ret = copy_from_user(req->tids, iovec[idx].iov_base, 741 ntids * sizeof(*req->tids)); 742 if (ret) { 743 SDMA_DBG(req, "Failed to copy %d TIDs (%d)", 744 ntids, ret); 745 ret = -EFAULT; 746 goto free_req; 747 } 748 req->n_tids = ntids; 749 idx++; 750 } 751 752 dlid = be16_to_cpu(req->hdr.lrh[1]); 753 selector = dlid_to_selector(dlid); 754 755 /* Have to select the engine */ 756 req->sde = sdma_select_engine_vl(dd, 757 (u32)(uctxt->ctxt + fd->subctxt + 758 selector), 759 vl); 760 if (!req->sde || !sdma_running(req->sde)) { 761 ret = -ECOMM; 762 goto free_req; 763 } 764 765 /* We don't need an AHG entry if the request contains only one packet */ 766 if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) { 767 int ahg = sdma_ahg_alloc(req->sde); 768 769 if (likely(ahg >= 0)) { 770 req->ahg_idx = (u8)ahg; 771 set_bit(SDMA_REQ_HAVE_AHG, &req->flags); 772 } 773 } 774 775 set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); 776 atomic_inc(&pq->n_reqs); 777 req_queued = 1; 778 /* Send the first N packets in the request to buy us some time */ 779 ret = user_sdma_send_pkts(req, pcount); 780 if (unlikely(ret < 0 && ret != -EBUSY)) { 781 req->status = ret; 782 goto free_req; 783 } 784 785 /* 786 * It is possible that the SDMA engine would have processed all the 787 * submitted packets by the time we get here. Therefore, only set 788 * packet queue state to ACTIVE if there are still uncompleted 789 * requests. 790 */ 791 if (atomic_read(&pq->n_reqs)) 792 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); 793 794 /* 795 * This is a somewhat blocking send implementation. 796 * The driver will block the caller until all packets of the 797 * request have been submitted to the SDMA engine. However, it 798 * will not wait for send completions. 799 */ 800 while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { 801 ret = user_sdma_send_pkts(req, pcount); 802 if (ret < 0) { 803 if (ret != -EBUSY) { 804 req->status = ret; 805 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 806 if (ACCESS_ONCE(req->seqcomp) == 807 req->seqsubmitted - 1) 808 goto free_req; 809 return ret; 810 } 811 wait_event_interruptible_timeout( 812 pq->busy.wait_dma, 813 (pq->state == SDMA_PKT_Q_ACTIVE), 814 msecs_to_jiffies( 815 SDMA_IOWAIT_TIMEOUT)); 816 } 817 } 818 *count += idx; 819 return 0; 820 free_req: 821 user_sdma_free_request(req, true); 822 if (req_queued) 823 pq_update(pq); 824 set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); 825 return ret; 826 } 827 828 static inline u32 compute_data_length(struct user_sdma_request *req, 829 struct user_sdma_txreq *tx) 830 { 831 /* 832 * Determine the proper size of the packet data. 833 * The size of the data of the first packet is in the header 834 * template. However, it includes the header and ICRC, which need 835 * to be subtracted. 836 * The minimum representable packet data length in a header is 4 bytes, 837 * therefore, when the data length request is less than 4 bytes, there's 838 * only one packet, and the packet data length is equal to that of the 839 * request data length. 840 * The size of the remaining packets is the minimum of the frag 841 * size (MTU) or remaining data in the request. 842 */ 843 u32 len; 844 845 if (!req->seqnum) { 846 if (req->data_len < sizeof(u32)) 847 len = req->data_len; 848 else 849 len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - 850 (sizeof(tx->hdr) - 4)); 851 } else if (req_opcode(req->info.ctrl) == EXPECTED) { 852 u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * 853 PAGE_SIZE; 854 /* 855 * Get the data length based on the remaining space in the 856 * TID pair. 857 */ 858 len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); 859 /* If we've filled up the TID pair, move to the next one. */ 860 if (unlikely(!len) && ++req->tididx < req->n_tids && 861 req->tids[req->tididx]) { 862 tidlen = EXP_TID_GET(req->tids[req->tididx], 863 LEN) * PAGE_SIZE; 864 req->tidoffset = 0; 865 len = min_t(u32, tidlen, req->info.fragsize); 866 } 867 /* 868 * Since the TID pairs map entire pages, make sure that we 869 * are not going to try to send more data that we have 870 * remaining. 871 */ 872 len = min(len, req->data_len - req->sent); 873 } else { 874 len = min(req->data_len - req->sent, (u32)req->info.fragsize); 875 } 876 SDMA_DBG(req, "Data Length = %u", len); 877 return len; 878 } 879 880 static inline u32 pad_len(u32 len) 881 { 882 if (len & (sizeof(u32) - 1)) 883 len += sizeof(u32) - (len & (sizeof(u32) - 1)); 884 return len; 885 } 886 887 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) 888 { 889 /* (Size of complete header - size of PBC) + 4B ICRC + data length */ 890 return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); 891 } 892 893 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) 894 { 895 int ret = 0; 896 unsigned npkts = 0; 897 struct user_sdma_txreq *tx = NULL; 898 struct hfi1_user_sdma_pkt_q *pq = NULL; 899 struct user_sdma_iovec *iovec = NULL; 900 901 if (!req->pq) 902 return -EINVAL; 903 904 pq = req->pq; 905 906 /* If tx completion has reported an error, we are done. */ 907 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 908 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 909 return -EFAULT; 910 } 911 912 /* 913 * Check if we might have sent the entire request already 914 */ 915 if (unlikely(req->seqnum == req->info.npkts)) { 916 if (!list_empty(&req->txps)) 917 goto dosend; 918 return ret; 919 } 920 921 if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) 922 maxpkts = req->info.npkts - req->seqnum; 923 924 while (npkts < maxpkts) { 925 u32 datalen = 0, queued = 0, data_sent = 0; 926 u64 iov_offset = 0; 927 928 /* 929 * Check whether any of the completions have come back 930 * with errors. If so, we are not going to process any 931 * more packets from this request. 932 */ 933 if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { 934 set_bit(SDMA_REQ_DONE_ERROR, &req->flags); 935 return -EFAULT; 936 } 937 938 tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); 939 if (!tx) 940 return -ENOMEM; 941 942 tx->flags = 0; 943 tx->req = req; 944 tx->busycount = 0; 945 INIT_LIST_HEAD(&tx->list); 946 947 if (req->seqnum == req->info.npkts - 1) 948 tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT; 949 950 /* 951 * Calculate the payload size - this is min of the fragment 952 * (MTU) size or the remaining bytes in the request but only 953 * if we have payload data. 954 */ 955 if (req->data_len) { 956 iovec = &req->iovs[req->iov_idx]; 957 if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { 958 if (++req->iov_idx == req->data_iovs) { 959 ret = -EFAULT; 960 goto free_txreq; 961 } 962 iovec = &req->iovs[req->iov_idx]; 963 WARN_ON(iovec->offset); 964 } 965 966 datalen = compute_data_length(req, tx); 967 if (!datalen) { 968 SDMA_DBG(req, 969 "Request has data but pkt len is 0"); 970 ret = -EFAULT; 971 goto free_tx; 972 } 973 } 974 975 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { 976 if (!req->seqnum) { 977 u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); 978 u32 lrhlen = get_lrh_len(req->hdr, 979 pad_len(datalen)); 980 /* 981 * Copy the request header into the tx header 982 * because the HW needs a cacheline-aligned 983 * address. 984 * This copy can be optimized out if the hdr 985 * member of user_sdma_request were also 986 * cacheline aligned. 987 */ 988 memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); 989 if (PBC2LRH(pbclen) != lrhlen) { 990 pbclen = (pbclen & 0xf000) | 991 LRH2PBC(lrhlen); 992 tx->hdr.pbc[0] = cpu_to_le16(pbclen); 993 } 994 ret = sdma_txinit_ahg(&tx->txreq, 995 SDMA_TXREQ_F_AHG_COPY, 996 sizeof(tx->hdr) + datalen, 997 req->ahg_idx, 0, NULL, 0, 998 user_sdma_txreq_cb); 999 if (ret) 1000 goto free_tx; 1001 ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, 1002 &tx->hdr, 1003 sizeof(tx->hdr)); 1004 if (ret) 1005 goto free_txreq; 1006 } else { 1007 int changes; 1008 1009 changes = set_txreq_header_ahg(req, tx, 1010 datalen); 1011 if (changes < 0) 1012 goto free_tx; 1013 sdma_txinit_ahg(&tx->txreq, 1014 SDMA_TXREQ_F_USE_AHG, 1015 datalen, req->ahg_idx, changes, 1016 req->ahg, sizeof(req->hdr), 1017 user_sdma_txreq_cb); 1018 } 1019 } else { 1020 ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + 1021 datalen, user_sdma_txreq_cb); 1022 if (ret) 1023 goto free_tx; 1024 /* 1025 * Modify the header for this packet. This only needs 1026 * to be done if we are not going to use AHG. Otherwise, 1027 * the HW will do it based on the changes we gave it 1028 * during sdma_txinit_ahg(). 1029 */ 1030 ret = set_txreq_header(req, tx, datalen); 1031 if (ret) 1032 goto free_txreq; 1033 } 1034 1035 /* 1036 * If the request contains any data vectors, add up to 1037 * fragsize bytes to the descriptor. 1038 */ 1039 while (queued < datalen && 1040 (req->sent + data_sent) < req->data_len) { 1041 unsigned long base, offset; 1042 unsigned pageidx, len; 1043 1044 base = (unsigned long)iovec->iov.iov_base; 1045 offset = offset_in_page(base + iovec->offset + 1046 iov_offset); 1047 pageidx = (((iovec->offset + iov_offset + 1048 base) - (base & PAGE_MASK)) >> PAGE_SHIFT); 1049 len = offset + req->info.fragsize > PAGE_SIZE ? 1050 PAGE_SIZE - offset : req->info.fragsize; 1051 len = min((datalen - queued), len); 1052 ret = sdma_txadd_page(pq->dd, &tx->txreq, 1053 iovec->pages[pageidx], 1054 offset, len); 1055 if (ret) { 1056 SDMA_DBG(req, "SDMA txreq add page failed %d\n", 1057 ret); 1058 goto free_txreq; 1059 } 1060 iov_offset += len; 1061 queued += len; 1062 data_sent += len; 1063 if (unlikely(queued < datalen && 1064 pageidx == iovec->npages && 1065 req->iov_idx < req->data_iovs - 1)) { 1066 iovec->offset += iov_offset; 1067 iovec = &req->iovs[++req->iov_idx]; 1068 iov_offset = 0; 1069 } 1070 } 1071 /* 1072 * The txreq was submitted successfully so we can update 1073 * the counters. 1074 */ 1075 req->koffset += datalen; 1076 if (req_opcode(req->info.ctrl) == EXPECTED) 1077 req->tidoffset += datalen; 1078 req->sent += data_sent; 1079 if (req->data_len) 1080 iovec->offset += iov_offset; 1081 list_add_tail(&tx->txreq.list, &req->txps); 1082 /* 1083 * It is important to increment this here as it is used to 1084 * generate the BTH.PSN and, therefore, can't be bulk-updated 1085 * outside of the loop. 1086 */ 1087 tx->seqnum = req->seqnum++; 1088 npkts++; 1089 } 1090 dosend: 1091 ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps); 1092 if (list_empty(&req->txps)) { 1093 req->seqsubmitted = req->seqnum; 1094 if (req->seqnum == req->info.npkts) { 1095 set_bit(SDMA_REQ_SEND_DONE, &req->flags); 1096 /* 1097 * The txreq has already been submitted to the HW queue 1098 * so we can free the AHG entry now. Corruption will not 1099 * happen due to the sequential manner in which 1100 * descriptors are processed. 1101 */ 1102 if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) 1103 sdma_ahg_free(req->sde, req->ahg_idx); 1104 } 1105 } else if (ret > 0) { 1106 req->seqsubmitted += ret; 1107 ret = 0; 1108 } 1109 return ret; 1110 1111 free_txreq: 1112 sdma_txclean(pq->dd, &tx->txreq); 1113 free_tx: 1114 kmem_cache_free(pq->txreq_cache, tx); 1115 return ret; 1116 } 1117 1118 /* 1119 * How many pages in this iovec element? 1120 */ 1121 static inline int num_user_pages(const struct iovec *iov) 1122 { 1123 const unsigned long addr = (unsigned long)iov->iov_base; 1124 const unsigned long len = iov->iov_len; 1125 const unsigned long spage = addr & PAGE_MASK; 1126 const unsigned long epage = (addr + len - 1) & PAGE_MASK; 1127 1128 return 1 + ((epage - spage) >> PAGE_SHIFT); 1129 } 1130 1131 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) 1132 { 1133 struct evict_data evict_data; 1134 1135 evict_data.cleared = 0; 1136 evict_data.target = npages; 1137 hfi1_mmu_rb_evict(pq->handler, &evict_data); 1138 return evict_data.cleared; 1139 } 1140 1141 static int pin_vector_pages(struct user_sdma_request *req, 1142 struct user_sdma_iovec *iovec) 1143 { 1144 int ret = 0, pinned, npages, cleared; 1145 struct page **pages; 1146 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1147 struct sdma_mmu_node *node = NULL; 1148 struct mmu_rb_node *rb_node; 1149 1150 rb_node = hfi1_mmu_rb_extract(pq->handler, 1151 (unsigned long)iovec->iov.iov_base, 1152 iovec->iov.iov_len); 1153 if (rb_node && !IS_ERR(rb_node)) 1154 node = container_of(rb_node, struct sdma_mmu_node, rb); 1155 else 1156 rb_node = NULL; 1157 1158 if (!node) { 1159 node = kzalloc(sizeof(*node), GFP_KERNEL); 1160 if (!node) 1161 return -ENOMEM; 1162 1163 node->rb.addr = (unsigned long)iovec->iov.iov_base; 1164 node->pq = pq; 1165 atomic_set(&node->refcount, 0); 1166 } 1167 1168 npages = num_user_pages(&iovec->iov); 1169 if (node->npages < npages) { 1170 pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); 1171 if (!pages) { 1172 SDMA_DBG(req, "Failed page array alloc"); 1173 ret = -ENOMEM; 1174 goto bail; 1175 } 1176 memcpy(pages, node->pages, node->npages * sizeof(*pages)); 1177 1178 npages -= node->npages; 1179 1180 retry: 1181 if (!hfi1_can_pin_pages(pq->dd, pq->mm, 1182 atomic_read(&pq->n_locked), npages)) { 1183 cleared = sdma_cache_evict(pq, npages); 1184 if (cleared >= npages) 1185 goto retry; 1186 } 1187 pinned = hfi1_acquire_user_pages(pq->mm, 1188 ((unsigned long)iovec->iov.iov_base + 1189 (node->npages * PAGE_SIZE)), npages, 0, 1190 pages + node->npages); 1191 if (pinned < 0) { 1192 kfree(pages); 1193 ret = pinned; 1194 goto bail; 1195 } 1196 if (pinned != npages) { 1197 unpin_vector_pages(pq->mm, pages, node->npages, 1198 pinned); 1199 ret = -EFAULT; 1200 goto bail; 1201 } 1202 kfree(node->pages); 1203 node->rb.len = iovec->iov.iov_len; 1204 node->pages = pages; 1205 node->npages += pinned; 1206 npages = node->npages; 1207 atomic_add(pinned, &pq->n_locked); 1208 } 1209 iovec->pages = node->pages; 1210 iovec->npages = npages; 1211 iovec->node = node; 1212 1213 ret = hfi1_mmu_rb_insert(req->pq->handler, &node->rb); 1214 if (ret) { 1215 atomic_sub(node->npages, &pq->n_locked); 1216 iovec->node = NULL; 1217 goto bail; 1218 } 1219 return 0; 1220 bail: 1221 if (rb_node) 1222 unpin_vector_pages(pq->mm, node->pages, 0, node->npages); 1223 kfree(node); 1224 return ret; 1225 } 1226 1227 static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, 1228 unsigned start, unsigned npages) 1229 { 1230 hfi1_release_user_pages(mm, pages + start, npages, false); 1231 kfree(pages); 1232 } 1233 1234 static int check_header_template(struct user_sdma_request *req, 1235 struct hfi1_pkt_header *hdr, u32 lrhlen, 1236 u32 datalen) 1237 { 1238 /* 1239 * Perform safety checks for any type of packet: 1240 * - transfer size is multiple of 64bytes 1241 * - packet length is multiple of 4 bytes 1242 * - packet length is not larger than MTU size 1243 * 1244 * These checks are only done for the first packet of the 1245 * transfer since the header is "given" to us by user space. 1246 * For the remainder of the packets we compute the values. 1247 */ 1248 if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || 1249 lrhlen > get_lrh_len(*hdr, req->info.fragsize)) 1250 return -EINVAL; 1251 1252 if (req_opcode(req->info.ctrl) == EXPECTED) { 1253 /* 1254 * The header is checked only on the first packet. Furthermore, 1255 * we ensure that at least one TID entry is copied when the 1256 * request is submitted. Therefore, we don't have to verify that 1257 * tididx points to something sane. 1258 */ 1259 u32 tidval = req->tids[req->tididx], 1260 tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, 1261 tididx = EXP_TID_GET(tidval, IDX), 1262 tidctrl = EXP_TID_GET(tidval, CTRL), 1263 tidoff; 1264 __le32 kval = hdr->kdeth.ver_tid_offset; 1265 1266 tidoff = KDETH_GET(kval, OFFSET) * 1267 (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? 1268 KDETH_OM_LARGE : KDETH_OM_SMALL); 1269 /* 1270 * Expected receive packets have the following 1271 * additional checks: 1272 * - offset is not larger than the TID size 1273 * - TIDCtrl values match between header and TID array 1274 * - TID indexes match between header and TID array 1275 */ 1276 if ((tidoff + datalen > tidlen) || 1277 KDETH_GET(kval, TIDCTRL) != tidctrl || 1278 KDETH_GET(kval, TID) != tididx) 1279 return -EINVAL; 1280 } 1281 return 0; 1282 } 1283 1284 /* 1285 * Correctly set the BTH.PSN field based on type of 1286 * transfer - eager packets can just increment the PSN but 1287 * expected packets encode generation and sequence in the 1288 * BTH.PSN field so just incrementing will result in errors. 1289 */ 1290 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) 1291 { 1292 u32 val = be32_to_cpu(bthpsn), 1293 mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : 1294 0xffffffull), 1295 psn = val & mask; 1296 if (expct) 1297 psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); 1298 else 1299 psn = psn + frags; 1300 return psn & mask; 1301 } 1302 1303 static int set_txreq_header(struct user_sdma_request *req, 1304 struct user_sdma_txreq *tx, u32 datalen) 1305 { 1306 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1307 struct hfi1_pkt_header *hdr = &tx->hdr; 1308 u16 pbclen; 1309 int ret; 1310 u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); 1311 1312 /* Copy the header template to the request before modification */ 1313 memcpy(hdr, &req->hdr, sizeof(*hdr)); 1314 1315 /* 1316 * Check if the PBC and LRH length are mismatched. If so 1317 * adjust both in the header. 1318 */ 1319 pbclen = le16_to_cpu(hdr->pbc[0]); 1320 if (PBC2LRH(pbclen) != lrhlen) { 1321 pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); 1322 hdr->pbc[0] = cpu_to_le16(pbclen); 1323 hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); 1324 /* 1325 * Third packet 1326 * This is the first packet in the sequence that has 1327 * a "static" size that can be used for the rest of 1328 * the packets (besides the last one). 1329 */ 1330 if (unlikely(req->seqnum == 2)) { 1331 /* 1332 * From this point on the lengths in both the 1333 * PBC and LRH are the same until the last 1334 * packet. 1335 * Adjust the template so we don't have to update 1336 * every packet 1337 */ 1338 req->hdr.pbc[0] = hdr->pbc[0]; 1339 req->hdr.lrh[2] = hdr->lrh[2]; 1340 } 1341 } 1342 /* 1343 * We only have to modify the header if this is not the 1344 * first packet in the request. Otherwise, we use the 1345 * header given to us. 1346 */ 1347 if (unlikely(!req->seqnum)) { 1348 ret = check_header_template(req, hdr, lrhlen, datalen); 1349 if (ret) 1350 return ret; 1351 goto done; 1352 } 1353 1354 hdr->bth[2] = cpu_to_be32( 1355 set_pkt_bth_psn(hdr->bth[2], 1356 (req_opcode(req->info.ctrl) == EXPECTED), 1357 req->seqnum)); 1358 1359 /* Set ACK request on last packet */ 1360 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) 1361 hdr->bth[2] |= cpu_to_be32(1UL << 31); 1362 1363 /* Set the new offset */ 1364 hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); 1365 /* Expected packets have to fill in the new TID information */ 1366 if (req_opcode(req->info.ctrl) == EXPECTED) { 1367 tidval = req->tids[req->tididx]; 1368 /* 1369 * If the offset puts us at the end of the current TID, 1370 * advance everything. 1371 */ 1372 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1373 PAGE_SIZE)) { 1374 req->tidoffset = 0; 1375 /* 1376 * Since we don't copy all the TIDs, all at once, 1377 * we have to check again. 1378 */ 1379 if (++req->tididx > req->n_tids - 1 || 1380 !req->tids[req->tididx]) { 1381 return -EINVAL; 1382 } 1383 tidval = req->tids[req->tididx]; 1384 } 1385 req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= 1386 KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL; 1387 /* Set KDETH.TIDCtrl based on value for this TID. */ 1388 KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, 1389 EXP_TID_GET(tidval, CTRL)); 1390 /* Set KDETH.TID based on value for this TID */ 1391 KDETH_SET(hdr->kdeth.ver_tid_offset, TID, 1392 EXP_TID_GET(tidval, IDX)); 1393 /* Clear KDETH.SH only on the last packet */ 1394 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) 1395 KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); 1396 /* 1397 * Set the KDETH.OFFSET and KDETH.OM based on size of 1398 * transfer. 1399 */ 1400 SDMA_DBG(req, "TID offset %ubytes %uunits om%u", 1401 req->tidoffset, req->tidoffset / req->omfactor, 1402 req->omfactor != KDETH_OM_SMALL); 1403 KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, 1404 req->tidoffset / req->omfactor); 1405 KDETH_SET(hdr->kdeth.ver_tid_offset, OM, 1406 req->omfactor != KDETH_OM_SMALL); 1407 } 1408 done: 1409 trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, 1410 req->info.comp_idx, hdr, tidval); 1411 return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); 1412 } 1413 1414 static int set_txreq_header_ahg(struct user_sdma_request *req, 1415 struct user_sdma_txreq *tx, u32 len) 1416 { 1417 int diff = 0; 1418 struct hfi1_user_sdma_pkt_q *pq = req->pq; 1419 struct hfi1_pkt_header *hdr = &req->hdr; 1420 u16 pbclen = le16_to_cpu(hdr->pbc[0]); 1421 u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); 1422 1423 if (PBC2LRH(pbclen) != lrhlen) { 1424 /* PBC.PbcLengthDWs */ 1425 AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, 1426 cpu_to_le16(LRH2PBC(lrhlen))); 1427 /* LRH.PktLen (we need the full 16 bits due to byte swap) */ 1428 AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, 1429 cpu_to_be16(lrhlen >> 2)); 1430 } 1431 1432 /* 1433 * Do the common updates 1434 */ 1435 /* BTH.PSN and BTH.A */ 1436 val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & 1437 (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); 1438 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) 1439 val32 |= 1UL << 31; 1440 AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); 1441 AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); 1442 /* KDETH.Offset */ 1443 AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, 1444 cpu_to_le16(req->koffset & 0xffff)); 1445 AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, 1446 cpu_to_le16(req->koffset >> 16)); 1447 if (req_opcode(req->info.ctrl) == EXPECTED) { 1448 __le16 val; 1449 1450 tidval = req->tids[req->tididx]; 1451 1452 /* 1453 * If the offset puts us at the end of the current TID, 1454 * advance everything. 1455 */ 1456 if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * 1457 PAGE_SIZE)) { 1458 req->tidoffset = 0; 1459 /* 1460 * Since we don't copy all the TIDs, all at once, 1461 * we have to check again. 1462 */ 1463 if (++req->tididx > req->n_tids - 1 || 1464 !req->tids[req->tididx]) { 1465 return -EINVAL; 1466 } 1467 tidval = req->tids[req->tididx]; 1468 } 1469 req->omfactor = ((EXP_TID_GET(tidval, LEN) * 1470 PAGE_SIZE) >= 1471 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE : 1472 KDETH_OM_SMALL; 1473 /* KDETH.OM and KDETH.OFFSET (TID) */ 1474 AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, 1475 ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 | 1476 ((req->tidoffset / req->omfactor) & 0x7fff))); 1477 /* KDETH.TIDCtrl, KDETH.TID */ 1478 val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | 1479 (EXP_TID_GET(tidval, IDX) & 0x3ff)); 1480 /* Clear KDETH.SH on last packet */ 1481 if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) { 1482 val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset, 1483 INTR) >> 16); 1484 val &= cpu_to_le16(~(1U << 13)); 1485 AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); 1486 } else { 1487 AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val); 1488 } 1489 } 1490 1491 trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, 1492 req->info.comp_idx, req->sde->this_idx, 1493 req->ahg_idx, req->ahg, diff, tidval); 1494 return diff; 1495 } 1496 1497 /* 1498 * SDMA tx request completion callback. Called when the SDMA progress 1499 * state machine gets notification that the SDMA descriptors for this 1500 * tx request have been processed by the DMA engine. Called in 1501 * interrupt context. 1502 */ 1503 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) 1504 { 1505 struct user_sdma_txreq *tx = 1506 container_of(txreq, struct user_sdma_txreq, txreq); 1507 struct user_sdma_request *req; 1508 struct hfi1_user_sdma_pkt_q *pq; 1509 struct hfi1_user_sdma_comp_q *cq; 1510 u16 idx; 1511 1512 if (!tx->req) 1513 return; 1514 1515 req = tx->req; 1516 pq = req->pq; 1517 cq = req->cq; 1518 1519 if (status != SDMA_TXREQ_S_OK) { 1520 SDMA_DBG(req, "SDMA completion with error %d", 1521 status); 1522 set_bit(SDMA_REQ_HAS_ERROR, &req->flags); 1523 } 1524 1525 req->seqcomp = tx->seqnum; 1526 kmem_cache_free(pq->txreq_cache, tx); 1527 tx = NULL; 1528 1529 idx = req->info.comp_idx; 1530 if (req->status == -1 && status == SDMA_TXREQ_S_OK) { 1531 if (req->seqcomp == req->info.npkts - 1) { 1532 req->status = 0; 1533 user_sdma_free_request(req, false); 1534 pq_update(pq); 1535 set_comp_state(pq, cq, idx, COMPLETE, 0); 1536 } 1537 } else { 1538 if (status != SDMA_TXREQ_S_OK) 1539 req->status = status; 1540 if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && 1541 (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || 1542 test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { 1543 user_sdma_free_request(req, false); 1544 pq_update(pq); 1545 set_comp_state(pq, cq, idx, ERROR, req->status); 1546 } 1547 } 1548 } 1549 1550 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) 1551 { 1552 if (atomic_dec_and_test(&pq->n_reqs)) { 1553 xchg(&pq->state, SDMA_PKT_Q_INACTIVE); 1554 wake_up(&pq->wait); 1555 } 1556 } 1557 1558 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) 1559 { 1560 if (!list_empty(&req->txps)) { 1561 struct sdma_txreq *t, *p; 1562 1563 list_for_each_entry_safe(t, p, &req->txps, list) { 1564 struct user_sdma_txreq *tx = 1565 container_of(t, struct user_sdma_txreq, txreq); 1566 list_del_init(&t->list); 1567 sdma_txclean(req->pq->dd, t); 1568 kmem_cache_free(req->pq->txreq_cache, tx); 1569 } 1570 } 1571 if (req->data_iovs) { 1572 struct sdma_mmu_node *node; 1573 int i; 1574 1575 for (i = 0; i < req->data_iovs; i++) { 1576 node = req->iovs[i].node; 1577 if (!node) 1578 continue; 1579 1580 if (unpin) 1581 hfi1_mmu_rb_remove(req->pq->handler, 1582 &node->rb); 1583 else 1584 atomic_dec(&node->refcount); 1585 } 1586 } 1587 kfree(req->tids); 1588 clear_bit(req->info.comp_idx, req->pq->req_in_use); 1589 } 1590 1591 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, 1592 struct hfi1_user_sdma_comp_q *cq, 1593 u16 idx, enum hfi1_sdma_comp_state state, 1594 int ret) 1595 { 1596 hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", 1597 pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); 1598 cq->comps[idx].status = state; 1599 if (state == ERROR) 1600 cq->comps[idx].errcode = -ret; 1601 trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, 1602 idx, state, ret); 1603 } 1604 1605 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, 1606 unsigned long len) 1607 { 1608 return (bool)(node->addr == addr); 1609 } 1610 1611 static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode) 1612 { 1613 struct sdma_mmu_node *node = 1614 container_of(mnode, struct sdma_mmu_node, rb); 1615 1616 atomic_inc(&node->refcount); 1617 return 0; 1618 } 1619 1620 /* 1621 * Return 1 to remove the node from the rb tree and call the remove op. 1622 * 1623 * Called with the rb tree lock held. 1624 */ 1625 static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, 1626 void *evict_arg, bool *stop) 1627 { 1628 struct sdma_mmu_node *node = 1629 container_of(mnode, struct sdma_mmu_node, rb); 1630 struct evict_data *evict_data = evict_arg; 1631 1632 /* is this node still being used? */ 1633 if (atomic_read(&node->refcount)) 1634 return 0; /* keep this node */ 1635 1636 /* this node will be evicted, add its pages to our count */ 1637 evict_data->cleared += node->npages; 1638 1639 /* have enough pages been cleared? */ 1640 if (evict_data->cleared >= evict_data->target) 1641 *stop = true; 1642 1643 return 1; /* remove this node */ 1644 } 1645 1646 static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) 1647 { 1648 struct sdma_mmu_node *node = 1649 container_of(mnode, struct sdma_mmu_node, rb); 1650 1651 atomic_sub(node->npages, &node->pq->n_locked); 1652 1653 unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); 1654 1655 kfree(node); 1656 } 1657 1658 static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode) 1659 { 1660 struct sdma_mmu_node *node = 1661 container_of(mnode, struct sdma_mmu_node, rb); 1662 1663 if (!atomic_read(&node->refcount)) 1664 return 1; 1665 return 0; 1666 } 1667