/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/dma-fence-array.h>
#include <linux/irq_work.h>
#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"
#include "gt/intel_ring.h"
#include "gt/intel_rps.h"

#include "i915_active.h"
#include "i915_drv.h"
#include "i915_globals.h"
#include "i915_trace.h"
#include "intel_pm.h"

struct execute_cb {
	struct list_head link;
	struct irq_work work;
	struct i915_sw_fence *fence;
	void (*hook)(struct i915_request *rq, struct dma_fence *signal);
	struct i915_request *signal;
};

static struct i915_global_request {
	struct i915_global base;
	struct kmem_cache *slab_requests;
	struct kmem_cache *slab_dependencies;
	struct kmem_cache *slab_execute_cbs;
} global;

static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return "i915";
}

static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	/*
	 * The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	return to_request(fence)->gem_context->name ?: "[i915]";
}

static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_request_completed(to_request(fence));
}

static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	return i915_request_enable_breadcrumb(to_request(fence));
}

static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_request_wait(to_request(fence),
				 interruptible | I915_WAIT_PRIORITY,
				 timeout);
}

static void i915_fence_release(struct dma_fence *fence)
{
	struct i915_request *rq = to_request(fence);

	/*
	 * The request is put onto an RCU freelist (i.e. the address
	 * is immediately reused), mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);

	kmem_cache_free(global.slab_requests, rq);
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};
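
/*
 * Illustrative sketch only, not part of the driver logic: once one of these
 * fences has been exported (e.g. via a sync_file), external code interacts
 * with it purely through the generic dma-fence API, which lands in the
 * callbacks above. Assuming the caller already holds its own reference:
 *
 *	struct dma_fence *fence = ...; // dma_fence_get(&rq->fence) elsewhere
 *
 *	if (!dma_fence_is_signaled(fence))
 *		dma_fence_wait_timeout(fence, true, msecs_to_jiffies(100));
 *	dma_fence_put(fence);
 *
 * dma_fence_wait_timeout() invokes i915_fence_wait() through the .wait hook,
 * and the final dma_fence_put() ends up in i915_fence_release().
 */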

static void irq_execute_cb(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	i915_sw_fence_complete(cb->fence);
	kmem_cache_free(global.slab_execute_cbs, cb);
}

static void irq_execute_cb_hook(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	cb->hook(container_of(cb->fence, struct i915_request, submit),
		 &cb->signal->fence);
	i915_request_put(cb->signal);

	irq_execute_cb(wrk);
}

static void __notify_execute_cb(struct i915_request *rq)
{
	struct execute_cb *cb;

	lockdep_assert_held(&rq->lock);

	if (list_empty(&rq->execute_cb))
		return;

	list_for_each_entry(cb, &rq->execute_cb, link)
		irq_work_queue(&cb->work);

	/*
	 * XXX Rollback on __i915_request_unsubmit()
	 *
	 * In the future, perhaps when we have an active time-slicing scheduler,
	 * it will be interesting to unsubmit parallel execution and remove
	 * busywaits from the GPU until their master is restarted. This is
	 * quite hairy, we have to carefully rollback the fence and do a
	 * preempt-to-idle cycle on the target engine, all the while the
	 * master execute_cb may refire.
	 */
	INIT_LIST_HEAD(&rq->execute_cb);
}

static inline void
remove_from_client(struct i915_request *request)
{
	struct drm_i915_file_private *file_priv;

	if (!READ_ONCE(request->file_priv))
		return;

	rcu_read_lock();
	file_priv = xchg(&request->file_priv, NULL);
	if (file_priv) {
		spin_lock(&file_priv->mm.lock);
		list_del(&request->client_link);
		spin_unlock(&file_priv->mm.lock);
	}
	rcu_read_unlock();
}

static void free_capture_list(struct i915_request *request)
{
	struct i915_capture_list *capture;

	capture = request->capture_list;
	while (capture) {
		struct i915_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

static void remove_from_engine(struct i915_request *rq)
{
	struct intel_engine_cs *engine, *locked;

	/*
	 * Virtual engines complicate acquiring the engine timeline lock,
	 * as their rq->engine pointer is not stable until under that
	 * engine lock. The simple ploy we use is to take the lock then
	 * check that the rq still belongs to the newly locked engine.
	 */
	locked = READ_ONCE(rq->engine);
	spin_lock_irq(&locked->active.lock);
	while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
		spin_unlock(&locked->active.lock);
		spin_lock(&engine->active.lock);
		locked = engine;
	}
	list_del(&rq->sched.link);
	spin_unlock_irq(&locked->active.lock);
}

bool i915_request_retire(struct i915_request *rq)
{
	if (!i915_request_completed(rq))
		return false;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  rq->engine->name,
		  rq->fence.context, rq->fence.seqno,
		  hwsp_seqno(rq));

	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
	trace_i915_request_retire(rq);

	/*
	 * We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of the tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	GEM_BUG_ON(!list_is_first(&rq->link,
				  &i915_request_timeline(rq)->requests));
	rq->ring->head = rq->postfix;

	/*
	 * We only loosely track inflight requests across preemption,
	 * and so we may find ourselves attempting to retire a _completed_
	 * request that we have removed from the HW and put back on a run
	 * queue.
	 */
	remove_from_engine(rq);

	spin_lock_irq(&rq->lock);
	i915_request_mark_complete(rq);
	if (!i915_request_signaled(rq))
		dma_fence_signal_locked(&rq->fence);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);
	if (i915_request_has_waitboost(rq)) {
		GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
		atomic_dec(&rq->engine->gt->rps.num_waiters);
	}
	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
		set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
		__notify_execute_cb(rq);
	}
	GEM_BUG_ON(!list_empty(&rq->execute_cb));
	spin_unlock_irq(&rq->lock);

	remove_from_client(rq);
	list_del(&rq->link);

	intel_context_exit(rq->hw_context);
	intel_context_unpin(rq->hw_context);

	free_capture_list(rq);
	i915_sched_node_fini(&rq->sched);
	i915_request_put(rq);

	return true;
}

void i915_request_retire_upto(struct i915_request *rq)
{
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_request *tmp;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  rq->engine->name,
		  rq->fence.context, rq->fence.seqno,
		  hwsp_seqno(rq));

	GEM_BUG_ON(!i915_request_completed(rq));

	do {
		tmp = list_first_entry(&tl->requests, typeof(*tmp), link);
	} while (i915_request_retire(tmp) && tmp != rq);
}

static int
__await_execution(struct i915_request *rq,
		  struct i915_request *signal,
		  void (*hook)(struct i915_request *rq,
			       struct dma_fence *signal),
		  gfp_t gfp)
{
	struct execute_cb *cb;

	if (i915_request_is_active(signal)) {
		if (hook)
			hook(rq, &signal->fence);
		return 0;
	}

	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
	if (!cb)
		return -ENOMEM;

	cb->fence = &rq->submit;
	i915_sw_fence_await(cb->fence);
	init_irq_work(&cb->work, irq_execute_cb);

	if (hook) {
		cb->hook = hook;
		cb->signal = i915_request_get(signal);
		cb->work.func = irq_execute_cb_hook;
	}

	spin_lock_irq(&signal->lock);
	if (i915_request_is_active(signal)) {
		if (hook) {
			hook(rq, &signal->fence);
			i915_request_put(signal);
		}
		i915_sw_fence_complete(cb->fence);
		kmem_cache_free(global.slab_execute_cbs, cb);
	} else {
		list_add_tail(&cb->link, &signal->execute_cb);
	}
	spin_unlock_irq(&signal->lock);

	/* Copy across semaphore status as we need the same behaviour */
	rq->sched.flags |= signal->sched.flags;
	return 0;
}
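
/*
 * In short: __await_execution() arranges for rq's submit fence (and the
 * optional hook) to be released via __notify_execute_cb() at the moment the
 * signaler is actually submitted to the hardware, rather than waiting for
 * the signaler to complete. This is what allows a semaphore busywait, for
 * example, to be queued alongside its signaler instead of strictly behind it.
 */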

bool __i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	bool result = false;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  engine->name,
		  request->fence.context, request->fence.seqno,
		  hwsp_seqno(request));

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * With the advent of preempt-to-busy, we frequently encounter
	 * requests that we have unsubmitted from HW, but left running
	 * until the next ack and so have completed in the meantime. On
	 * resubmission of that completed request, we can skip
	 * updating the payload, and execlists can even skip submitting
	 * the request.
	 *
	 * We must remove the request from the caller's priority queue,
	 * and the caller must only call us when the request is in their
	 * priority queue, under the active.lock. This ensures that the
	 * request has *not* yet been retired and we can safely move
	 * the request into the engine->active.list where it will be
	 * dropped upon retiring. (Otherwise if we resubmit a *retired*
	 * request, this would be a horrible use-after-free.)
	 */
	if (i915_request_completed(request))
		goto xfer;

	if (i915_gem_context_is_banned(request->gem_context))
		i915_request_skip(request, -EIO);

	/*
	 * Are we using semaphores when the gpu is already saturated?
	 *
	 * Using semaphores incurs a cost in having the GPU poll a
	 * memory location, busywaiting for it to change. The continual
	 * memory reads can have a noticeable impact on the rest of the
	 * system with the extra bus traffic, stalling the cpu as it too
	 * tries to access memory across the bus (perf stat -e bus-cycles).
	 *
	 * If we installed a semaphore on this request and we only submit
	 * the request after the signaler completed, that indicates the
	 * system is overloaded and using semaphores at this time only
	 * increases the amount of work we are doing. If so, we disable
	 * further use of semaphores until we are idle again, whence we
	 * optimistically try again.
	 */
	if (request->sched.semaphores &&
	    i915_sw_fence_signaled(&request->semaphore))
		engine->saturated |= request->sched.semaphores;

	engine->emit_fini_breadcrumb(request,
				     request->ring->vaddr + request->postfix);

	trace_i915_request_execute(request);
	engine->serial++;
	result = true;

xfer:	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags))
		list_move_tail(&request->sched.link, &engine->active.requests);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
	    !i915_request_enable_breadcrumb(request))
		intel_engine_queue_breadcrumbs(engine);

	__notify_execute_cb(request);

	spin_unlock(&request->lock);

	return result;
}

void i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_submit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void __i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  engine->name,
		  request->fence.context, request->fence.seqno,
		  hwsp_seqno(request));

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * Only unwind in reverse order, required so that the per-context list
	 * is kept in seqno/ring order.
	 */

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
		i915_request_cancel_breadcrumb(request);

	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	spin_unlock(&request->lock);

	/* We've already spun, don't charge on resubmitting. */
	if (request->sched.semaphores && i915_request_started(request)) {
		request->sched.attr.priority |= I915_PRIORITY_NOSEMAPHORE;
		request->sched.semaphores = 0;
	}

	/*
	 * We don't need to wake_up any waiters on request->execute, they
	 * will get woken by any other event or us re-adding this request
	 * to the engine timeline (__i915_request_submit()). The waiters
	 * should be quite adept at finding that the request now has a new
	 * global_seqno compared to the one they went to sleep on.
	 */
}

void i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_unsubmit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), submit);

	switch (state) {
	case FENCE_COMPLETE:
		trace_i915_request_submit(request);

		if (unlikely(fence->error))
			i915_request_skip(request, fence->error);

		/*
		 * We need to serialize use of the submit_request() callback
		 * with its hotplugging performed during an emergency
		 * i915_gem_set_wedged(). We use the RCU mechanism to mark the
		 * critical section in order to force i915_gem_set_wedged() to
		 * wait until the submit_request() is completed before
		 * proceeding.
		 */
		rcu_read_lock();
		request->engine->submit_request(request);
		rcu_read_unlock();
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static int __i915_sw_fence_call
semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), semaphore);

	switch (state) {
	case FENCE_COMPLETE:
		i915_schedule_bump_priority(request, I915_PRIORITY_NOSEMAPHORE);
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static void retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
}

static noinline struct i915_request *
request_alloc_slow(struct intel_timeline *tl, gfp_t gfp)
{
	struct i915_request *rq;

	if (list_empty(&tl->requests))
		goto out;

	if (!gfpflags_allow_blocking(gfp))
		goto out;

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	i915_request_retire(rq);

	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (rq)
		return rq;

	/* Ratelimit ourselves to prevent oom from malicious clients */
	rq = list_last_entry(&tl->requests, typeof(*rq), link);
	cond_synchronize_rcu(rq->rcustate);

	/* Retire our old requests in the hope that we free some */
	retire_requests(tl);

out:
	return kmem_cache_alloc(global.slab_requests, gfp);
}

struct i915_request *
__i915_request_create(struct intel_context *ce, gfp_t gfp)
{
	struct intel_timeline *tl = ce->timeline;
	struct i915_request *rq;
	u32 seqno;
	int ret;

	might_sleep_if(gfpflags_allow_blocking(gfp));

	/* Check that the caller provided an already pinned context */
	__intel_context_pin(ce);

	/*
	 * Beware: Dragons be flying overhead.
	 *
	 * We use RCU to look up requests in flight. The lookups may
	 * race with the request being allocated from the slab freelist.
	 * That is, the request we are writing to here may be in the process
	 * of being read by __i915_active_request_get_rcu(). As such,
	 * we have to be very careful when overwriting the contents. During
	 * the RCU lookup, we chase the request->engine pointer,
	 * read the request->global_seqno and increment the reference count.
	 *
	 * The reference count is incremented atomically. If it is zero,
	 * the lookup knows the request is unallocated and complete. Otherwise,
	 * it is either still in use, or has been reallocated and reset
	 * with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to matches the active
	 * request.
	 *
	 * Before we increment the refcount, we chase the request->engine
	 * pointer. We must not call kmem_cache_zalloc() or else we set
	 * that pointer to NULL and cause a crash during the lookup. If
	 * we see the request is completed (based on the value of the
	 * old engine and seqno), the lookup is complete and reports NULL.
	 * If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it won't be - and restart the lookup.
	 *
	 * Do not use kmem_cache_zalloc() here!
	 */
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (unlikely(!rq)) {
		rq = request_alloc_slow(tl, gfp);
		if (!rq) {
			ret = -ENOMEM;
			goto err_unreserve;
		}
	}

	ret = intel_timeline_get_seqno(tl, rq, &seqno);
	if (ret)
		goto err_free;

	rq->i915 = ce->engine->i915;
	rq->hw_context = ce;
	rq->gem_context = ce->gem_context;
	rq->engine = ce->engine;
	rq->ring = ce->ring;
	rq->execution_mask = ce->engine->mask;

	rcu_assign_pointer(rq->timeline, tl);
	rq->hwsp_seqno = tl->hwsp_seqno;
	rq->hwsp_cacheline = tl->hwsp_cacheline;

	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */

	spin_lock_init(&rq->lock);
	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
		       tl->fence_context, seqno);

	/* We bump the ref for the fence chain */
	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
	i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);

	i915_sched_node_init(&rq->sched);

	/* No zalloc, must clear what we need by hand */
	rq->file_priv = NULL;
	rq->batch = NULL;
	rq->capture_list = NULL;
	rq->flags = 0;

	INIT_LIST_HEAD(&rq->execute_cb);

	/*
	 * Reserve space in the ring buffer for all the commands required to
	 * eventually emit this request. This is to guarantee that the
	 * i915_request_add() call can't fail. Note that the reserve may need
	 * to be redone if the request is not actually submitted straight
	 * away, e.g. because a GPU scheduler has deferred it.
	 *
	 * Note that due to how we add reserved_space to intel_ring_begin()
	 * we need to double our request to ensure that if we need to wrap
	 * around inside i915_request_add() there is sufficient space at
	 * the beginning of the ring as well.
	 */
	rq->reserved_space =
		2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32);
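
	/*
	 * Worked example (the dword count is illustrative, not taken from any
	 * particular platform): with emit_fini_breadcrumb_dw == 16, we set
	 * aside 2 * 16 * sizeof(u32) = 128 bytes - enough for one breadcrumb
	 * at the tail of the ring plus a second copy should intel_ring_begin()
	 * be forced to wrap back to the start.
	 */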

	/*
	 * Record the position of the start of the request so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the head.
	 */
	rq->head = rq->ring->emit;

	ret = rq->engine->request_alloc(rq);
	if (ret)
		goto err_unwind;

	rq->infix = rq->ring->emit; /* end of header; start of user payload */

	intel_context_mark_active(ce);
	return rq;

err_unwind:
	ce->ring->emit = rq->head;

	/* Make sure we didn't add ourselves to external state before freeing */
	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));

err_free:
	kmem_cache_free(global.slab_requests, rq);
err_unreserve:
	intel_context_unpin(ce);
	return ERR_PTR(ret);
}

struct i915_request *
i915_request_create(struct intel_context *ce)
{
	struct i915_request *rq;
	struct intel_timeline *tl;

	tl = intel_context_timeline_lock(ce);
	if (IS_ERR(tl))
		return ERR_CAST(tl);

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&tl->requests, typeof(*rq), link);
	if (!list_is_last(&rq->link, &tl->requests))
		i915_request_retire(rq);

	intel_context_enter(ce);
	rq = __i915_request_create(ce, GFP_KERNEL);
	intel_context_exit(ce); /* active reference transferred to request */
	if (IS_ERR(rq))
		goto err_unlock;

	/* Check that we do not interrupt ourselves with a new request */
	rq->cookie = lockdep_pin_lock(&tl->mutex);

	return rq;

err_unlock:
	intel_context_timeline_unlock(tl);
	return rq;
}
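
/*
 * Typical flow for a caller of the request API, sketched here for
 * orientation only (error handling trimmed; the real users live in the GEM
 * execbuf and GT code):
 *
 *	rq = i915_request_create(ce);	   // takes ce->timeline->mutex
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *
 *	cs = intel_ring_begin(rq, 4);	   // space was reserved at create time
 *	... emit commands ...
 *	intel_ring_advance(rq, cs);
 *
 *	i915_request_add(rq);		   // commits and drops the mutex
 *
 * After i915_request_add() the request belongs to the submission backend
 * and must only be referenced via i915_request_get()/i915_request_put().
 */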

static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
	struct intel_timeline *tl;
	struct dma_fence *fence;
	int err;

	GEM_BUG_ON(i915_request_timeline(rq) ==
		   rcu_access_pointer(signal->timeline));

	rcu_read_lock();
	tl = rcu_dereference(signal->timeline);
	if (i915_request_started(signal) || !kref_get_unless_zero(&tl->kref))
		tl = NULL;
	rcu_read_unlock();
	if (!tl) /* already started or maybe even completed */
		return 0;

	fence = ERR_PTR(-EBUSY);
	if (mutex_trylock(&tl->mutex)) {
		fence = NULL;
		if (!i915_request_started(signal) &&
		    !list_is_first(&signal->link, &tl->requests)) {
			signal = list_prev_entry(signal, link);
			fence = dma_fence_get(&signal->fence);
		}
		mutex_unlock(&tl->mutex);
	}
	intel_timeline_put(tl);
	if (IS_ERR_OR_NULL(fence))
		return PTR_ERR_OR_ZERO(fence);

	err = 0;
	if (intel_timeline_sync_is_later(i915_request_timeline(rq), fence))
		err = i915_sw_fence_await_dma_fence(&rq->submit,
						    fence, 0,
						    I915_FENCE_GFP);
	dma_fence_put(fence);

	return err;
}

static intel_engine_mask_t
already_busywaiting(struct i915_request *rq)
{
	/*
	 * Polling a semaphore causes bus traffic, delaying other users of
	 * both the GPU and CPU. We want to limit the impact on others,
	 * while taking advantage of early submission to reduce GPU
	 * latency. Therefore we restrict ourselves to not using more
	 * than one semaphore from each source, and not using a semaphore
	 * if we have detected the engine is saturated (i.e. would not be
	 * submitted early and cause bus traffic reading an already passed
	 * semaphore).
	 *
	 * See the are-we-too-late? check in __i915_request_submit().
	 */
	return rq->sched.semaphores | rq->engine->saturated;
}

static int
__emit_semaphore_wait(struct i915_request *to,
		      struct i915_request *from,
		      u32 seqno)
{
	const int has_token = INTEL_GEN(to->i915) >= 12;
	u32 hwsp_offset;
	int len, err;
	u32 *cs;

	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);

	/* We need to pin the signaler's HWSP until we are finished reading. */
	err = intel_timeline_read_hwsp(from, to, &hwsp_offset);
	if (err)
		return err;

	len = 4;
	if (has_token)
		len += 2;

	cs = intel_ring_begin(to, len);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Using greater-than-or-equal here means we have to worry
	 * about seqno wraparound. To side step that issue, we swap
	 * the timeline HWSP upon wrapping, so that everyone listening
	 * for the old (pre-wrap) values does not see the much smaller
	 * (post-wrap) values than they were expecting (and so wait
	 * forever).
	 */
	*cs++ = (MI_SEMAPHORE_WAIT |
		 MI_SEMAPHORE_GLOBAL_GTT |
		 MI_SEMAPHORE_POLL |
		 MI_SEMAPHORE_SAD_GTE_SDD) +
		has_token;
	*cs++ = seqno;
	*cs++ = hwsp_offset;
	*cs++ = 0;
	if (has_token) {
		*cs++ = 0;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(to, cs);
	return 0;
}

static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	/* Just emit the first semaphore we see as request space is limited. */
	if (already_busywaiting(to) & from->engine->mask)
		goto await_fence;

	if (i915_request_await_start(to, from) < 0)
		goto await_fence;

	/* Only submit our spinner after the signaler is running! */
	if (__await_execution(to, from, NULL, gfp))
		goto await_fence;

	if (__emit_semaphore_wait(to, from, from->fence.seqno))
		goto await_fence;

	to->sched.semaphores |= from->engine->mask;
	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
	return 0;

await_fence:
	return i915_sw_fence_await_dma_fence(&to->submit,
					     &from->fence, 0,
					     I915_FENCE_GFP);
}

static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
	int ret;

	GEM_BUG_ON(to == from);
	GEM_BUG_ON(to->timeline == from->timeline);

	if (i915_request_completed(from))
		return 0;

	if (to->engine->schedule) {
		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (ret < 0)
			return ret;
	}

	if (to->engine == from->engine) {
		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
						       &from->submit,
						       I915_FENCE_GFP);
	} else if (intel_engine_has_semaphores(to->engine) &&
		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
	} else {
		ret = i915_sw_fence_await_dma_fence(&to->submit,
						    &from->fence, 0,
						    I915_FENCE_GFP);
	}
	if (ret < 0)
		return ret;

	if (to->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN) {
		ret = i915_sw_fence_await_dma_fence(&to->semaphore,
						    &from->fence, 0,
						    I915_FENCE_GFP);
		if (ret < 0)
			return ret;
	}

	return 0;
}

int
i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	/*
	 * Note that if the fence-array was created in signal-on-any mode,
	 * we should *not* decompose it into its individual fences. However,
	 * we don't currently store which mode the fence-array is operating
	 * in. Fortunately, the only user of signal-on-any is private to
	 * amdgpu and we should not see any incoming fence-array from
	 * sync-file being in signal-on-any mode.
	 */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
			continue;

		/*
		 * Requests on the same timeline are explicitly ordered, along
		 * with their dependencies, by i915_request_add() which ensures
		 * that requests are submitted in-order through each ring.
		 */
		if (fence->context == rq->fence.context)
			continue;

		/* Squash repeated waits to the same timelines */
		if (fence->context &&
		    intel_timeline_sync_is_later(i915_request_timeline(rq),
						 fence))
			continue;

		if (dma_fence_is_i915(fence))
			ret = i915_request_await_request(rq, to_request(fence));
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    fence->context ? I915_FENCE_TIMEOUT : 0,
							    I915_FENCE_GFP);
		if (ret < 0)
			return ret;

		/* Record the latest fence used against each timeline */
		if (fence->context)
			intel_timeline_sync_set(i915_request_timeline(rq),
						fence);
	} while (--nchild);

	return 0;
}

static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
					  struct dma_fence *fence)
{
	return __intel_timeline_sync_is_later(tl,
					      fence->context,
					      fence->seqno - 1);
}

static int intel_timeline_sync_set_start(struct intel_timeline *tl,
					 const struct dma_fence *fence)
{
	return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
}
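
/*
 * The seqno - 1 trick above lets the ordinary timeline sync map double up as
 * a record of start-alignment: seqnos along a timeline are emitted in order,
 * so having synchronised against seqno - 1 of a context means we are already
 * ordered against (at least) the start of that fence. That is all the
 * squashing check in __i915_request_await_execution() below needs to know.
 */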

static int
__i915_request_await_execution(struct i915_request *to,
			       struct i915_request *from,
			       void (*hook)(struct i915_request *rq,
					    struct dma_fence *signal))
{
	int err;

	/* Submit both requests at the same time */
	err = __await_execution(to, from, hook, I915_FENCE_GFP);
	if (err)
		return err;

	/* Squash repeated dependencies to the same timelines */
	if (intel_timeline_sync_has_start(i915_request_timeline(to),
					  &from->fence))
		return 0;

	/* Ensure both start together [after all semaphores in signal] */
	if (intel_engine_has_semaphores(to->engine))
		err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
	else
		err = i915_request_await_start(to, from);
	if (err < 0)
		return err;

	/* Couple the dependency tree for PI on this exposed to->fence */
	if (to->engine->schedule) {
		err = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (err < 0)
			return err;
	}

	return intel_timeline_sync_set_start(i915_request_timeline(to),
					     &from->fence);
}

int
i915_request_await_execution(struct i915_request *rq,
			     struct dma_fence *fence,
			     void (*hook)(struct i915_request *rq,
					  struct dma_fence *signal))
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		/* XXX Error for signal-on-any fence arrays */

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
			continue;

		/*
		 * We don't squash repeated fence dependencies here as we
		 * want to run our callback in all cases.
		 */

		if (dma_fence_is_i915(fence))
			ret = __i915_request_await_execution(rq,
							     to_request(fence),
							     hook);
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    I915_FENCE_TIMEOUT,
							    GFP_KERNEL);
		if (ret < 0)
			return ret;
	} while (--nchild);

	return 0;
}

/**
 * i915_request_await_object - set this request to (async) wait upon a bo
 * @to: request we are wishing to use
 * @obj: object which may be in use on another ring.
 * @write: whether the wait is on behalf of a writer
 *
 * This code is meant to abstract object synchronization with the GPU.
 * Conceptually we serialise writes between engines inside the GPU.
 * We only allow one engine to write into a buffer at any time, but
 * multiple readers. To ensure each has a coherent view of memory, we must:
 *
 * - If there is an outstanding write request to the object, the new
 *   request must wait for it to complete (either CPU or in hw, requests
 *   on the same ring will be naturally ordered).
 *
 * - If we are a write request (pending_write_domain is set), the new
 *   request must wait for outstanding read requests to complete.
 *
 * Returns 0 if successful, else propagates up the lower layer error.
 */
int
i915_request_await_object(struct i915_request *to,
			  struct drm_i915_gem_object *obj,
			  bool write)
{
	struct dma_fence *excl;
	int ret = 0;

	if (write) {
		struct dma_fence **shared;
		unsigned int count, i;

		ret = dma_resv_get_fences_rcu(obj->base.resv,
					      &excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			ret = i915_request_await_dma_fence(to, shared[i]);
			if (ret)
				break;

			dma_fence_put(shared[i]);
		}

		for (; i < count; i++)
			dma_fence_put(shared[i]);
		kfree(shared);
	} else {
		excl = dma_resv_get_excl_rcu(obj->base.resv);
	}

	if (excl) {
		if (ret == 0)
			ret = i915_request_await_dma_fence(to, excl);

		dma_fence_put(excl);
	}

	return ret;
}
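
/*
 * For orientation, a sketch of how the above is typically paired with
 * activity tracking during execbuf (illustrative only; locking, pinning and
 * error unwinding are omitted, and vma stands for the object's binding
 * obtained elsewhere):
 *
 *	err = i915_request_await_object(rq, obj, true);
 *	if (err == 0)
 *		err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
 *
 * i.e. first wait upon the fences already attached to the object, then
 * publish this request as the new fence for others to wait upon.
 */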

void i915_request_skip(struct i915_request *rq, int error)
{
	void *vaddr = rq->ring->vaddr;
	u32 head;

	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
	dma_fence_set_error(&rq->fence, error);

	if (rq->infix == rq->postfix)
		return;

	/*
	 * As this request likely depends on state from the lost
	 * context, clear out all the user operations leaving the
	 * breadcrumb at the end (so we get the fence notifications).
	 */
	head = rq->infix;
	if (rq->postfix < head) {
		memset(vaddr + head, 0, rq->ring->size - head);
		head = 0;
	}
	memset(vaddr + head, 0, rq->postfix - head);
	rq->infix = rq->postfix;
}

static struct i915_request *
__i915_request_add_to_timeline(struct i915_request *rq)
{
	struct intel_timeline *timeline = i915_request_timeline(rq);
	struct i915_request *prev;

	/*
	 * Dependency tracking and request ordering along the timeline
	 * is special cased so that we can eliminate redundant ordering
	 * operations while building the request (we know that the timeline
	 * itself is ordered, and here we guarantee it).
	 *
	 * As we know we will need to emit tracking along the timeline,
	 * we embed the hooks into our request struct -- at the cost of
	 * having to have specialised no-allocation interfaces (which will
	 * be beneficial elsewhere).
	 *
	 * A second benefit to open-coding i915_request_await_request is
	 * that we can apply a slight variant of the rules specialised
	 * for timelines that jump between engines (such as virtual engines).
	 * If we consider the case of a virtual engine, we must emit a dma-fence
	 * to prevent scheduling of the second request until the first is
	 * complete (to maximise our greedy late load balancing) and this
	 * precludes optimising to use semaphore serialisation of a single
	 * timeline across engines.
	 */
	prev = to_request(__i915_active_fence_set(&timeline->last_request,
						  &rq->fence));
	if (prev && !i915_request_completed(prev)) {
		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
			i915_sw_fence_await_sw_fence(&rq->submit,
						     &prev->submit,
						     &rq->submitq);
		else
			__i915_sw_fence_await_dma_fence(&rq->submit,
							&prev->fence,
							&rq->dmaq);
		if (rq->engine->schedule)
			__i915_sched_node_add_dependency(&rq->sched,
							 &prev->sched,
							 &rq->dep,
							 0);
	}

	list_add_tail(&rq->link, &timeline->requests);

	/*
	 * Make sure that no request gazumped us - if it was allocated after
	 * our i915_request_alloc() and called __i915_request_add() before
	 * us, the timeline will hold its seqno which is later than ours.
	 */
	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);

	return prev;
}
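
/*
 * A note on the is_power_of_2() check above: engine->mask has a single bit
 * set for a physical engine, whereas a virtual engine carries the union of
 * its siblings' bits. The OR of the two masks is therefore only a power of
 * two when both requests are confined to the same physical engine, in which
 * case the lighter-weight submit-fence ordering suffices; any cross-engine
 * (or virtual engine) pairing falls back to the full dma-fence await.
 */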

/*
 * NB: This function is not allowed to fail. Doing so would mean the
 * request is not being tracked for completion but the work itself is
 * going to happen on the hardware. This would be a Bad Thing(tm).
 */
struct i915_request *__i915_request_commit(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_ring *ring = rq->ring;
	u32 *cs;

	GEM_TRACE("%s fence %llx:%lld\n",
		  engine->name, rq->fence.context, rq->fence.seqno);

	/*
	 * To ensure that this call will not fail, space for its emissions
	 * should already have been reserved in the ring buffer. Let the ring
	 * know that it is time to use that space up.
	 */
	GEM_BUG_ON(rq->reserved_space > ring->space);
	rq->reserved_space = 0;
	rq->emitted_jiffies = jiffies;

	/*
	 * Record the position of the start of the breadcrumb so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the ring's HEAD.
	 */
	cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw);
	GEM_BUG_ON(IS_ERR(cs));
	rq->postfix = intel_ring_offset(rq, cs);

	return __i915_request_add_to_timeline(rq);
}

void __i915_request_queue(struct i915_request *rq,
			  const struct i915_sched_attr *attr)
{
	/*
	 * Let the backend know a new request has arrived that may need
	 * to adjust the existing execution schedule due to a high priority
	 * request - i.e. we may want to preempt the current request in order
	 * to run a high priority dependency chain *before* we can execute this
	 * request.
	 *
	 * This is called before the request is ready to run so that we can
	 * decide whether to preempt the entire chain so that it is ready to
	 * run at the earliest possible convenience.
	 */
	i915_sw_fence_commit(&rq->semaphore);
	if (attr && rq->engine->schedule)
		rq->engine->schedule(rq, attr);
	i915_sw_fence_commit(&rq->submit);
}

void i915_request_add(struct i915_request *rq)
{
	struct i915_sched_attr attr = rq->gem_context->sched;
	struct intel_timeline * const tl = i915_request_timeline(rq);
	struct i915_request *prev;

	lockdep_assert_held(&tl->mutex);
	lockdep_unpin_lock(&tl->mutex, rq->cookie);

	trace_i915_request_add(rq);

	prev = __i915_request_commit(rq);

	/*
	 * Boost actual workloads past semaphores!
	 *
	 * With semaphores we spin on one engine waiting for another,
	 * simply to reduce the latency of starting our work when
	 * the signaler completes. However, if there is any other
	 * work that we could be doing on this engine instead, that
	 * is better utilisation and will reduce the overall duration
	 * of the current work. To avoid PI boosting a semaphore
	 * far in the distant past over useful work, we keep a history
	 * of any semaphore use along our dependency chain.
	 */
	if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
		attr.priority |= I915_PRIORITY_NOSEMAPHORE;

	/*
	 * Boost priorities to new clients (new request flows).
	 *
	 * Allow interactive/synchronous clients to jump ahead of
	 * the bulk clients. (FQ_CODEL)
	 */
	if (list_empty(&rq->sched.signalers_list))
		attr.priority |= I915_PRIORITY_WAIT;

	local_bh_disable();
	__i915_request_queue(rq, &attr);
	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */

	/*
	 * In typical scenarios, we do not expect the previous request on
	 * the timeline to be still tracked by timeline->last_request if it
	 * has been completed. If the completed request is still here, that
	 * implies that request retirement is a long way behind submission,
	 * suggesting that we haven't been retiring frequently enough from
	 * the combination of retire-before-alloc, waiters and the background
	 * retirement worker. So if the last request on this timeline was
	 * already completed, do a catch up pass, flushing the retirement queue
	 * up to this client. Since we have now moved the heaviest operations
	 * during retirement onto secondary workers, such as freeing objects
	 * or contexts, retiring a bunch of requests is mostly list management
	 * (and cache misses), and so we should not be overly penalizing this
	 * client by performing excess work, though we may still be performing
	 * work on behalf of others -- but instead we should benefit from
	 * improved resource management. (Well, that's the theory at least.)
	 */
	if (prev &&
	    i915_request_completed(prev) &&
	    rcu_access_pointer(prev->timeline) == tl)
		i915_request_retire_upto(prev);

	mutex_unlock(&tl->mutex);
}

static unsigned long local_clock_us(unsigned int *cpu)
{
	unsigned long t;

	/*
	 * Cheaply and approximately convert from nanoseconds to microseconds.
	 * The result and subsequent calculations are also defined in the same
	 * approximate microseconds units. The principal source of timing
	 * error here is from the simple truncation.
	 *
	 * Note that local_clock() is only defined wrt the current CPU;
	 * the comparisons are no longer valid if we switch CPUs. Instead of
	 * blocking preemption for the entire busywait, we can detect the CPU
	 * switch and use that as indicator of system load and a reason to
	 * stop busywaiting, see busywait_stop().
	 */
	*cpu = get_cpu();
	t = local_clock() >> 10;
	put_cpu();

	return t;
}
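
/*
 * A quick worked example of the approximation above: local_clock() returns
 * nanoseconds, and ">> 10" divides by 1024 rather than 1000, so
 * 1,000,000ns >> 10 = 976, i.e. the reported "microseconds" read roughly
 * 2.4% low. For a busywait cut-off measured in tens of microseconds that
 * truncation error is negligible.
 */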

static bool busywait_stop(unsigned long timeout, unsigned int cpu)
{
	unsigned int this_cpu;

	if (time_after(local_clock_us(&this_cpu), timeout))
		return true;

	return this_cpu != cpu;
}

static bool __i915_spin_request(const struct i915_request * const rq,
				int state, unsigned long timeout_us)
{
	unsigned int cpu;

	/*
	 * Only wait for the request if we know it is likely to complete.
	 *
	 * We don't track the timestamps around requests, nor the average
	 * request length, so we do not have a good indicator that this
	 * request will complete within the timeout. What we do know is the
	 * order in which requests are executed by the context and so we can
	 * tell if the request has been started. If the request is not even
	 * running yet, it is a fair assumption that it will not complete
	 * within our relatively short timeout.
	 */
	if (!i915_request_is_running(rq))
		return false;

	/*
	 * When waiting for high frequency requests, e.g. during synchronous
	 * rendering split between the CPU and GPU, the finite amount of time
	 * required to set up the irq and wait upon it limits the response
	 * rate. By busywaiting on the request completion for a short while we
	 * can service the high frequency waits as quickly as possible. However,
	 * if it is a slow request, we want to sleep as quickly as possible.
	 * The tradeoff between waiting and sleeping is roughly the time it
	 * takes to sleep on a request, on the order of a microsecond.
	 */

	timeout_us += local_clock_us(&cpu);
	do {
		if (i915_request_completed(rq))
			return true;

		if (signal_pending_state(state, current))
			break;

		if (busywait_stop(timeout_us, cpu))
			break;

		cpu_relax();
	} while (!need_resched());

	return false;
}

struct request_wait {
	struct dma_fence_cb cb;
	struct task_struct *tsk;
};

static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct request_wait *wait = container_of(cb, typeof(*wait), cb);

	wake_up_process(wait->tsk);
}

/**
 * i915_request_wait - wait until execution of request has finished
 * @rq: the request to wait upon
 * @flags: how to wait
 * @timeout: how long to wait in jiffies
 *
 * i915_request_wait() waits for the request to be completed, for a
 * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
 * unbounded wait).
 *
 * Returns the remaining time (in jiffies) if the request completed, which may
 * be zero or -ETIME if the request is unfinished after the timeout expires.
 * May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is
 * pending before the request completes.
 */
long i915_request_wait(struct i915_request *rq,
		       unsigned int flags,
		       long timeout)
{
	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
	struct request_wait wait;

	might_sleep();
	GEM_BUG_ON(timeout < 0);

	if (dma_fence_is_signaled(&rq->fence))
		return timeout;

	if (!timeout)
		return -ETIME;

	trace_i915_request_wait_begin(rq, flags);

	/*
	 * We must never wait on the GPU while holding a lock as we
	 * may need to perform a GPU reset. So while we don't need to
	 * serialise wait/reset with an explicit lock, we do want
	 * lockdep to detect potential dependency cycles.
	 */
	mutex_acquire(&rq->engine->gt->reset.mutex.dep_map, 0, 0, _THIS_IP_);

	/*
	 * Optimistic spin before touching IRQs.
	 *
	 * We may use a rather large value here to offset the penalty of
	 * switching away from the active task. Frequently, the client will
	 * wait upon an old swapbuffer to throttle itself to remain within a
	 * frame of the gpu. If the client is running in lockstep with the gpu,
	 * then it should not be waiting long at all, and a sleep now will incur
	 * extra scheduler latency in producing the next frame. To try to
	 * avoid adding the cost of enabling/disabling the interrupt to the
	 * short wait, we first spin to see if the request would have completed
	 * in the time taken to setup the interrupt.
	 *
	 * We need up to 5us to enable the irq, and up to 20us to hide the
	 * scheduler latency of a context switch, ignoring the secondary
	 * impacts from a context switch such as cache eviction.
	 *
	 * The scheme used for low-latency IO is called "hybrid interrupt
	 * polling". The suggestion there is to sleep until just before you
	 * expect to be woken by the device interrupt and then poll for its
	 * completion. That requires having a good predictor for the request
	 * duration, which we currently lack.
	 */
	if (IS_ACTIVE(CONFIG_DRM_I915_SPIN_REQUEST) &&
	    __i915_spin_request(rq, state, CONFIG_DRM_I915_SPIN_REQUEST)) {
		dma_fence_signal(&rq->fence);
		goto out;
	}

	/*
	 * This client is about to stall waiting for the GPU. In many cases
	 * this is undesirable and limits the throughput of the system, as
	 * many clients cannot continue processing user input/output whilst
	 * blocked. RPS autotuning may take tens of milliseconds to respond
	 * to the GPU load and thus incurs additional latency for the client.
	 * We can circumvent that by promoting the GPU frequency to maximum
	 * before we sleep. This makes the GPU throttle up much more quickly
	 * (good for benchmarks and user experience, e.g. window animations),
	 * but at a cost of spending more power processing the workload
	 * (bad for battery).
	 */
	if (flags & I915_WAIT_PRIORITY) {
		if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
			intel_rps_boost(rq);
		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
	}

	wait.tsk = current;
	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
		goto out;

	for (;;) {
		set_current_state(state);

		if (i915_request_completed(rq)) {
			dma_fence_signal(&rq->fence);
			break;
		}

		if (signal_pending_state(state, current)) {
			timeout = -ERESTARTSYS;
			break;
		}

		if (!timeout) {
			timeout = -ETIME;
			break;
		}

		intel_engine_flush_submission(rq->engine);
		timeout = io_schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);

	dma_fence_remove_callback(&rq->fence, &wait.cb);

out:
	mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_);
	trace_i915_request_wait_end(rq);
	return timeout;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"
#endif

static void i915_global_request_shrink(void)
{
	kmem_cache_shrink(global.slab_dependencies);
	kmem_cache_shrink(global.slab_execute_cbs);
	kmem_cache_shrink(global.slab_requests);
}

static void i915_global_request_exit(void)
{
	kmem_cache_destroy(global.slab_dependencies);
	kmem_cache_destroy(global.slab_execute_cbs);
	kmem_cache_destroy(global.slab_requests);
}

static struct i915_global_request global = { {
	.shrink = i915_global_request_shrink,
	.exit = i915_global_request_exit,
} };

int __init i915_global_request_init(void)
{
	global.slab_requests = KMEM_CACHE(i915_request,
					  SLAB_HWCACHE_ALIGN |
					  SLAB_RECLAIM_ACCOUNT |
					  SLAB_TYPESAFE_BY_RCU);
	if (!global.slab_requests)
		return -ENOMEM;

	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
					     SLAB_HWCACHE_ALIGN |
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_TYPESAFE_BY_RCU);
	if (!global.slab_execute_cbs)
		goto err_requests;

	global.slab_dependencies = KMEM_CACHE(i915_dependency,
					      SLAB_HWCACHE_ALIGN |
					      SLAB_RECLAIM_ACCOUNT);
	if (!global.slab_dependencies)
		goto err_execute_cbs;

	i915_global_register(&global.base);
	return 0;

err_execute_cbs:
	kmem_cache_destroy(global.slab_execute_cbs);
err_requests:
	kmem_cache_destroy(global.slab_requests);
	return -ENOMEM;
}
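
/*
 * The slab caches above are expected to be set up once at module load via
 * the i915_globals machinery (see i915_globals.c), which also invokes the
 * registered .shrink and .exit hooks, roughly speaking when the driver has
 * been idle for a while and on module unload respectively.
 */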