/*
 * Copyright © 2008-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/dma-fence-array.h>
#include <linux/irq_work.h>
#include <linux/prefetch.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"

#include "i915_active.h"
#include "i915_drv.h"
#include "i915_globals.h"
#include "intel_pm.h"

struct execute_cb {
	struct list_head link;
	struct irq_work work;
	struct i915_sw_fence *fence;
	void (*hook)(struct i915_request *rq, struct dma_fence *signal);
	struct i915_request *signal;
};

static struct i915_global_request {
	struct i915_global base;
	struct kmem_cache *slab_requests;
	struct kmem_cache *slab_dependencies;
	struct kmem_cache *slab_execute_cbs;
} global;

static const char *i915_fence_get_driver_name(struct dma_fence *fence)
{
	return "i915";
}

static const char *i915_fence_get_timeline_name(struct dma_fence *fence)
{
	/*
	 * The timeline struct (as part of the ppgtt underneath a context)
	 * may be freed when the request is no longer in use by the GPU.
	 * We could extend the life of a context to beyond that of all
	 * fences, possibly keeping the hw resource around indefinitely,
	 * or we just give them a false name. Since
	 * dma_fence_ops.get_timeline_name is a debug feature, the occasional
	 * lie seems justifiable.
	 */
	if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
		return "signaled";

	return to_request(fence)->gem_context->name ?: "[i915]";
}

static bool i915_fence_signaled(struct dma_fence *fence)
{
	return i915_request_completed(to_request(fence));
}

static bool i915_fence_enable_signaling(struct dma_fence *fence)
{
	return i915_request_enable_breadcrumb(to_request(fence));
}

static signed long i915_fence_wait(struct dma_fence *fence,
				   bool interruptible,
				   signed long timeout)
{
	return i915_request_wait(to_request(fence),
				 interruptible | I915_WAIT_PRIORITY,
				 timeout);
}

static void i915_fence_release(struct dma_fence *fence)
{
	struct i915_request *rq = to_request(fence);

	/*
	 * The request is put onto a RCU freelist (i.e. the address
	 * is immediately reused), mark the fences as being freed now.
	 * Otherwise the debugobjects for the fences are only marked as
	 * freed when the slab cache itself is freed, and so we would get
	 * caught trying to reuse dead objects.
	 */
	i915_sw_fence_fini(&rq->submit);
	i915_sw_fence_fini(&rq->semaphore);

	kmem_cache_free(global.slab_requests, rq);
}

const struct dma_fence_ops i915_fence_ops = {
	.get_driver_name = i915_fence_get_driver_name,
	.get_timeline_name = i915_fence_get_timeline_name,
	.enable_signaling = i915_fence_enable_signaling,
	.signaled = i915_fence_signaled,
	.wait = i915_fence_wait,
	.release = i915_fence_release,
};

static inline void
i915_request_remove_from_client(struct i915_request *request)
{
	struct drm_i915_file_private *file_priv;

	file_priv = request->file_priv;
	if (!file_priv)
		return;

	spin_lock(&file_priv->mm.lock);
	if (request->file_priv) {
		list_del(&request->client_link);
		request->file_priv = NULL;
	}
	spin_unlock(&file_priv->mm.lock);
}

static void advance_ring(struct i915_request *request)
{
	struct intel_ring *ring = request->ring;
	unsigned int tail;

	/*
	 * We know the GPU must have read the request to have
	 * sent us the seqno + interrupt, so use the position
	 * of the tail of the request to update the last known position
	 * of the GPU head.
	 *
	 * Note this requires that we are always called in request
	 * completion order.
	 */
	GEM_BUG_ON(!list_is_first(&request->ring_link, &ring->request_list));
	if (list_is_last(&request->ring_link, &ring->request_list)) {
		/*
		 * We may race here with execlists resubmitting this request
		 * as we retire it. The resubmission will move the ring->tail
		 * forwards (to request->wa_tail). We either read the
		 * current value that was written to hw, or the value that
		 * is just about to be. Either works, if we miss the last two
		 * noops - they are safe to be replayed on a reset.
		 */
		tail = READ_ONCE(request->tail);
		list_del(&ring->active_link);
	} else {
		tail = request->postfix;
	}
	list_del_init(&request->ring_link);

	ring->head = tail;
}

static void free_capture_list(struct i915_request *request)
{
	struct i915_capture_list *capture;

	capture = request->capture_list;
	while (capture) {
		struct i915_capture_list *next = capture->next;

		kfree(capture);
		capture = next;
	}
}

static bool i915_request_retire(struct i915_request *rq)
{
	struct i915_active_request *active, *next;

	lockdep_assert_held(&rq->i915->drm.struct_mutex);
	if (!i915_request_completed(rq))
		return false;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  rq->engine->name,
		  rq->fence.context, rq->fence.seqno,
		  hwsp_seqno(rq));

	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
	trace_i915_request_retire(rq);

	advance_ring(rq);

	/*
	 * Walk through the active list, calling retire on each. This allows
	 * objects to track their GPU activity and mark themselves as idle
	 * when their *last* active request is completed (updating state
	 * tracking lists for eviction, active references for GEM, etc).
	 *
	 * As the ->retire() may free the node, we decouple it first and
	 * pass along the auxiliary information (to avoid dereferencing
	 * the node after the callback).
	 */
	list_for_each_entry_safe(active, next, &rq->active_list, link) {
		/*
		 * In microbenchmarks or focusing upon time inside the kernel,
		 * we may spend an inordinate amount of time simply handling
		 * the retirement of requests and processing their callbacks.
		 * Of which, this loop itself is particularly hot due to the
		 * cache misses when jumping around the list of
		 * i915_active_request. So we try to keep this loop as
		 * streamlined as possible and also prefetch the next
		 * i915_active_request to try and hide the likely cache miss.
		 */
		prefetchw(next);

		INIT_LIST_HEAD(&active->link);
		RCU_INIT_POINTER(active->request, NULL);

		active->retire(active, rq);
	}

	local_irq_disable();

	spin_lock(&rq->engine->active.lock);
	list_del(&rq->sched.link);
	spin_unlock(&rq->engine->active.lock);

	spin_lock(&rq->lock);
	i915_request_mark_complete(rq);
	if (!i915_request_signaled(rq))
		dma_fence_signal_locked(&rq->fence);
	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
		i915_request_cancel_breadcrumb(rq);
	if (rq->waitboost) {
		GEM_BUG_ON(!atomic_read(&rq->i915->gt_pm.rps.num_waiters));
		atomic_dec(&rq->i915->gt_pm.rps.num_waiters);
	}
	spin_unlock(&rq->lock);

	local_irq_enable();

	intel_context_exit(rq->hw_context);
	intel_context_unpin(rq->hw_context);

	i915_request_remove_from_client(rq);
	list_del(&rq->link);

	free_capture_list(rq);
	i915_sched_node_fini(&rq->sched);
	i915_request_put(rq);

	return true;
}

void i915_request_retire_upto(struct i915_request *rq)
{
	struct intel_ring *ring = rq->ring;
	struct i915_request *tmp;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  rq->engine->name,
		  rq->fence.context, rq->fence.seqno,
		  hwsp_seqno(rq));

	lockdep_assert_held(&rq->i915->drm.struct_mutex);
	GEM_BUG_ON(!i915_request_completed(rq));

	if (list_empty(&rq->ring_link))
		return;

	do {
		tmp = list_first_entry(&ring->request_list,
				       typeof(*tmp), ring_link);
	} while (i915_request_retire(tmp) && tmp != rq);
}

static void irq_execute_cb(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	i915_sw_fence_complete(cb->fence);
	kmem_cache_free(global.slab_execute_cbs, cb);
}

static void irq_execute_cb_hook(struct irq_work *wrk)
{
	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);

	cb->hook(container_of(cb->fence, struct i915_request, submit),
		 &cb->signal->fence);
	i915_request_put(cb->signal);

	irq_execute_cb(wrk);
}

static void __notify_execute_cb(struct i915_request *rq)
{
	struct execute_cb *cb;

	lockdep_assert_held(&rq->lock);

	if (list_empty(&rq->execute_cb))
		return;

	list_for_each_entry(cb, &rq->execute_cb, link)
		irq_work_queue(&cb->work);

	/*
	 * XXX Rollback on __i915_request_unsubmit()
	 *
	 * In the future, perhaps when we have an active time-slicing scheduler,
	 * it will be interesting to unsubmit parallel execution and remove
	 * busywaits from the GPU until their master is restarted. This is
	 * quite hairy, we have to carefully rollback the fence and do a
	 * preempt-to-idle cycle on the target engine, all the while the
	 * master execute_cb may refire.
	 */
	INIT_LIST_HEAD(&rq->execute_cb);
}

static int
__i915_request_await_execution(struct i915_request *rq,
			       struct i915_request *signal,
			       void (*hook)(struct i915_request *rq,
					    struct dma_fence *signal),
			       gfp_t gfp)
{
	struct execute_cb *cb;

	if (i915_request_is_active(signal)) {
		if (hook)
			hook(rq, &signal->fence);
		return 0;
	}

	cb = kmem_cache_alloc(global.slab_execute_cbs, gfp);
	if (!cb)
		return -ENOMEM;

	cb->fence = &rq->submit;
	i915_sw_fence_await(cb->fence);
	init_irq_work(&cb->work, irq_execute_cb);

	if (hook) {
		cb->hook = hook;
		cb->signal = i915_request_get(signal);
		cb->work.func = irq_execute_cb_hook;
	}

	spin_lock_irq(&signal->lock);
	if (i915_request_is_active(signal)) {
		if (hook) {
			hook(rq, &signal->fence);
			i915_request_put(signal);
		}
		i915_sw_fence_complete(cb->fence);
		kmem_cache_free(global.slab_execute_cbs, cb);
	} else {
		list_add_tail(&cb->link, &signal->execute_cb);
	}
	spin_unlock_irq(&signal->lock);

	return 0;
}

void __i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  engine->name,
		  request->fence.context, request->fence.seqno,
		  hwsp_seqno(request));

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	if (i915_gem_context_is_banned(request->gem_context))
		i915_request_skip(request, -EIO);

	/*
	 * Are we using semaphores when the gpu is already saturated?
	 *
	 * Using semaphores incurs a cost in having the GPU poll a
	 * memory location, busywaiting for it to change. The continual
	 * memory reads can have a noticeable impact on the rest of the
	 * system with the extra bus traffic, stalling the cpu as it too
	 * tries to access memory across the bus (perf stat -e bus-cycles).
	 *
	 * If we installed a semaphore on this request and we only submit
	 * the request after the signaler completed, that indicates the
	 * system is overloaded and using semaphores at this time only
	 * increases the amount of work we are doing. If so, we disable
	 * further use of semaphores until we are idle again, whereupon we
	 * optimistically try again.
	 */
	if (request->sched.semaphores &&
	    i915_sw_fence_signaled(&request->semaphore))
		engine->saturated |= request->sched.semaphores;

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	list_move_tail(&request->sched.link, &engine->active.requests);

	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
	    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
	    !i915_request_enable_breadcrumb(request))
		intel_engine_queue_breadcrumbs(engine);

	__notify_execute_cb(request);

	spin_unlock(&request->lock);

	engine->emit_fini_breadcrumb(request,
				     request->ring->vaddr + request->postfix);

	engine->serial++;

	trace_i915_request_execute(request);
}

void i915_request_submit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_submit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void __i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;

	GEM_TRACE("%s fence %llx:%lld, current %d\n",
		  engine->name,
		  request->fence.context, request->fence.seqno,
		  hwsp_seqno(request));

	GEM_BUG_ON(!irqs_disabled());
	lockdep_assert_held(&engine->active.lock);

	/*
	 * Only unwind in reverse order, required so that the per-context list
	 * is kept in seqno/ring order.
	 */

	/* We may be recursing from the signal callback of another i915 fence */
	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);

	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
		i915_request_cancel_breadcrumb(request);

	GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);

	spin_unlock(&request->lock);

	/* We've already spun, don't charge on resubmitting. */
	if (request->sched.semaphores && i915_request_started(request)) {
		request->sched.attr.priority |= I915_PRIORITY_NOSEMAPHORE;
		request->sched.semaphores = 0;
	}

	/*
	 * We don't need to wake_up any waiters on request->execute, they
	 * will get woken by any other event or us re-adding this request
	 * to the engine timeline (__i915_request_submit()). The waiters
	 * should be quite adept at finding that the request now has a new
	 * global_seqno compared to the one they went to sleep on.
	 */
}

void i915_request_unsubmit(struct i915_request *request)
{
	struct intel_engine_cs *engine = request->engine;
	unsigned long flags;

	/* Will be called from irq-context when using foreign fences. */
	spin_lock_irqsave(&engine->active.lock, flags);

	__i915_request_unsubmit(request);

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

static int __i915_sw_fence_call
submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), submit);

	switch (state) {
	case FENCE_COMPLETE:
		trace_i915_request_submit(request);
		/*
		 * We need to serialize use of the submit_request() callback
		 * with its hotplugging performed during an emergency
		 * i915_gem_set_wedged(). We use the RCU mechanism to mark the
		 * critical section in order to force i915_gem_set_wedged() to
		 * wait until the submit_request() is completed before
		 * proceeding.
		 */
		rcu_read_lock();
		request->engine->submit_request(request);
		rcu_read_unlock();
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static int __i915_sw_fence_call
semaphore_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
{
	struct i915_request *request =
		container_of(fence, typeof(*request), semaphore);

	switch (state) {
	case FENCE_COMPLETE:
		i915_schedule_bump_priority(request, I915_PRIORITY_NOSEMAPHORE);
		break;

	case FENCE_FREE:
		i915_request_put(request);
		break;
	}

	return NOTIFY_DONE;
}

static void ring_retire_requests(struct intel_ring *ring)
{
	struct i915_request *rq, *rn;

	list_for_each_entry_safe(rq, rn, &ring->request_list, ring_link)
		if (!i915_request_retire(rq))
			break;
}

static noinline struct i915_request *
request_alloc_slow(struct intel_context *ce, gfp_t gfp)
{
	struct intel_ring *ring = ce->ring;
	struct i915_request *rq;

	if (list_empty(&ring->request_list))
		goto out;

	if (!gfpflags_allow_blocking(gfp))
		goto out;

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&ring->request_list, typeof(*rq), ring_link);
	i915_request_retire(rq);

	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (rq)
		return rq;

	/* Ratelimit ourselves to prevent oom from malicious clients */
	rq = list_last_entry(&ring->request_list, typeof(*rq), ring_link);
	cond_synchronize_rcu(rq->rcustate);

	/* Retire our old requests in the hope that we free some */
	ring_retire_requests(ring);

out:
	return kmem_cache_alloc(global.slab_requests, gfp);
}

struct i915_request *
__i915_request_create(struct intel_context *ce, gfp_t gfp)
{
	struct i915_timeline *tl = ce->ring->timeline;
	struct i915_request *rq;
	u32 seqno;
	int ret;

	might_sleep_if(gfpflags_allow_blocking(gfp));

	/* Check that the caller provided an already pinned context */
	__intel_context_pin(ce);

	/*
	 * Beware: Dragons be flying overhead.
	 *
	 * We use RCU to look up requests in flight. The lookups may
	 * race with the request being allocated from the slab freelist.
	 * That is, the request we are writing to here may be in the process
	 * of being read by __i915_active_request_get_rcu(). As such,
	 * we have to be very careful when overwriting the contents.
	 * During the RCU lookup, we chase the request->engine pointer,
	 * read the request->global_seqno and increment the reference count.
	 *
	 * The reference count is incremented atomically. If it is zero,
	 * the lookup knows the request is unallocated and complete. Otherwise,
	 * it is either still in use, or has been reallocated and reset
	 * with dma_fence_init(). This increment is safe for release as we
	 * check that the request we have a reference to matches the active
	 * request.
	 *
	 * Before we increment the refcount, we chase the request->engine
	 * pointer. We must not call kmem_cache_zalloc() or else we set
	 * that pointer to NULL and cause a crash during the lookup. If
	 * we see the request is completed (based on the value of the
	 * old engine and seqno), the lookup is complete and reports NULL.
	 * If we decide the request is not completed (new engine or seqno),
	 * then we grab a reference and double check that it is still the
	 * active request - which it won't be and restart the lookup.
	 *
	 * Do not use kmem_cache_zalloc() here!
	 */
	rq = kmem_cache_alloc(global.slab_requests,
			      gfp | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
	if (unlikely(!rq)) {
		rq = request_alloc_slow(ce, gfp);
		if (!rq) {
			ret = -ENOMEM;
			goto err_unreserve;
		}
	}

	ret = i915_timeline_get_seqno(tl, rq, &seqno);
	if (ret)
		goto err_free;

	rq->i915 = ce->engine->i915;
	rq->hw_context = ce;
	rq->gem_context = ce->gem_context;
	rq->engine = ce->engine;
	rq->ring = ce->ring;
	rq->timeline = tl;
	rq->hwsp_seqno = tl->hwsp_seqno;
	rq->hwsp_cacheline = tl->hwsp_cacheline;
	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */

	spin_lock_init(&rq->lock);
	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
		       tl->fence_context, seqno);

	/* We bump the ref for the fence chain */
	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
	i915_sw_fence_init(&i915_request_get(rq)->semaphore, semaphore_notify);

	i915_sched_node_init(&rq->sched);

	/* No zalloc, must clear what we need by hand */
	rq->file_priv = NULL;
	rq->batch = NULL;
	rq->capture_list = NULL;
	rq->waitboost = false;
	rq->execution_mask = ALL_ENGINES;

	INIT_LIST_HEAD(&rq->active_list);
	INIT_LIST_HEAD(&rq->execute_cb);

	/*
	 * Reserve space in the ring buffer for all the commands required to
	 * eventually emit this request. This is to guarantee that the
	 * i915_request_add() call can't fail. Note that the reserve may need
	 * to be redone if the request is not actually submitted straight
	 * away, e.g. because a GPU scheduler has deferred it.
	 *
	 * Note that due to how we add reserved_space to intel_ring_begin()
	 * we need to double our request to ensure that if we need to wrap
	 * around inside i915_request_add() there is sufficient space at
	 * the beginning of the ring as well.
	 */
	rq->reserved_space =
		2 * rq->engine->emit_fini_breadcrumb_dw * sizeof(u32);

	/*
	 * Record the position of the start of the request so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the head.
	 */
	rq->head = rq->ring->emit;

	ret = rq->engine->request_alloc(rq);
	if (ret)
		goto err_unwind;

	rq->infix = rq->ring->emit; /* end of header; start of user payload */

	intel_context_mark_active(ce);
	return rq;

err_unwind:
	ce->ring->emit = rq->head;

	/* Make sure we didn't add ourselves to external state before freeing */
	GEM_BUG_ON(!list_empty(&rq->active_list));
	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));

err_free:
	kmem_cache_free(global.slab_requests, rq);
err_unreserve:
	intel_context_unpin(ce);
	return ERR_PTR(ret);
}

struct i915_request *
i915_request_create(struct intel_context *ce)
{
	struct i915_request *rq;
	int err;

	err = intel_context_timeline_lock(ce);
	if (err)
		return ERR_PTR(err);

	/* Move our oldest request to the slab-cache (if not in use!) */
	rq = list_first_entry(&ce->ring->request_list, typeof(*rq), ring_link);
	if (!list_is_last(&rq->ring_link, &ce->ring->request_list))
		i915_request_retire(rq);

	intel_context_enter(ce);
	rq = __i915_request_create(ce, GFP_KERNEL);
	intel_context_exit(ce); /* active reference transferred to request */
	if (IS_ERR(rq))
		goto err_unlock;

	/* Check that we do not interrupt ourselves with a new request */
	rq->cookie = lockdep_pin_lock(&ce->ring->timeline->mutex);

	return rq;

err_unlock:
	intel_context_timeline_unlock(ce);
	return rq;
}

static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
	if (list_is_first(&signal->ring_link, &signal->ring->request_list))
		return 0;

	signal = list_prev_entry(signal, ring_link);
	if (i915_timeline_sync_is_later(rq->timeline, &signal->fence))
		return 0;

	return i915_sw_fence_await_dma_fence(&rq->submit,
					     &signal->fence, 0,
					     I915_FENCE_GFP);
}

static intel_engine_mask_t
already_busywaiting(struct i915_request *rq)
{
	/*
	 * Polling a semaphore causes bus traffic, delaying other users of
	 * both the GPU and CPU. We want to limit the impact on others,
	 * while taking advantage of early submission to reduce GPU
	 * latency. Therefore we restrict ourselves to not using more
	 * than one semaphore from each source, and not using a semaphore
	 * if we have detected the engine is saturated (i.e. would not be
	 * submitted early and cause bus traffic reading an already passed
	 * semaphore).
	 *
	 * See the are-we-too-late? check in __i915_request_submit().
	 */
	return rq->sched.semaphores | rq->engine->saturated;
}

static int
emit_semaphore_wait(struct i915_request *to,
		    struct i915_request *from,
		    gfp_t gfp)
{
	u32 hwsp_offset;
	u32 *cs;
	int err;

	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);

	/* Just emit the first semaphore we see as request space is limited. */
	if (already_busywaiting(to) & from->engine->mask)
		return i915_sw_fence_await_dma_fence(&to->submit,
						     &from->fence, 0,
						     I915_FENCE_GFP);

	err = i915_request_await_start(to, from);
	if (err < 0)
		return err;

	/* Only submit our spinner after the signaler is running! */
	err = __i915_request_await_execution(to, from, NULL, gfp);
	if (err)
		return err;

	/* We need to pin the signaler's HWSP until we are finished reading. */
	err = i915_timeline_read_hwsp(from, to, &hwsp_offset);
	if (err)
		return err;

	cs = intel_ring_begin(to, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * Using greater-than-or-equal here means we have to worry
	 * about seqno wraparound. To side step that issue, we swap
	 * the timeline HWSP upon wrapping, so that everyone listening
	 * for the old (pre-wrap) values does not see the much smaller
	 * (post-wrap) values than they were expecting (and so wait
	 * forever).
	 */
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_GTE_SDD;
	*cs++ = from->fence.seqno;
	*cs++ = hwsp_offset;
	*cs++ = 0;

	intel_ring_advance(to, cs);
	to->sched.semaphores |= from->engine->mask;
	to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
	return 0;
}

static int
i915_request_await_request(struct i915_request *to, struct i915_request *from)
{
	int ret;

	GEM_BUG_ON(to == from);
	GEM_BUG_ON(to->timeline == from->timeline);

	if (i915_request_completed(from))
		return 0;

	if (to->engine->schedule) {
		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
		if (ret < 0)
			return ret;
	}

	if (to->engine == from->engine) {
		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
						       &from->submit,
						       I915_FENCE_GFP);
	} else if (intel_engine_has_semaphores(to->engine) &&
		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
	} else {
		ret = i915_sw_fence_await_dma_fence(&to->submit,
						    &from->fence, 0,
						    I915_FENCE_GFP);
	}
	if (ret < 0)
		return ret;

	if (to->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN) {
		ret = i915_sw_fence_await_dma_fence(&to->semaphore,
						    &from->fence, 0,
						    I915_FENCE_GFP);
		if (ret < 0)
			return ret;
	}

	return 0;
}

int
i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	/*
	 * Note that if the fence-array was created in signal-on-any mode,
	 * we should *not* decompose it into its individual fences. However,
	 * we don't currently store which mode the fence-array is operating
	 * in. Fortunately, the only user of signal-on-any is private to
	 * amdgpu and we should not see any incoming fence-array from
	 * sync-file being in signal-on-any mode.
	 */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
			continue;

		/*
		 * Requests on the same timeline are explicitly ordered, along
		 * with their dependencies, by i915_request_add() which ensures
		 * that requests are submitted in-order through each ring.
		 */
		if (fence->context == rq->fence.context)
			continue;

		/* Squash repeated waits to the same timelines */
		if (fence->context != rq->i915->mm.unordered_timeline &&
		    i915_timeline_sync_is_later(rq->timeline, fence))
			continue;

		if (dma_fence_is_i915(fence))
			ret = i915_request_await_request(rq, to_request(fence));
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    I915_FENCE_TIMEOUT,
							    I915_FENCE_GFP);
		if (ret < 0)
			return ret;

		/* Record the latest fence used against each timeline */
		if (fence->context != rq->i915->mm.unordered_timeline)
			i915_timeline_sync_set(rq->timeline, fence);
	} while (--nchild);

	return 0;
}

int
i915_request_await_execution(struct i915_request *rq,
			     struct dma_fence *fence,
			     void (*hook)(struct i915_request *rq,
					  struct dma_fence *signal))
{
	struct dma_fence **child = &fence;
	unsigned int nchild = 1;
	int ret;

	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);

		/* XXX Error for signal-on-any fence arrays */

		child = array->fences;
		nchild = array->num_fences;
		GEM_BUG_ON(!nchild);
	}

	do {
		fence = *child++;
		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags))
			continue;

		/*
		 * We don't squash repeated fence dependencies here as we
		 * want to run our callback in all cases.
		 */

		if (dma_fence_is_i915(fence))
			ret = __i915_request_await_execution(rq,
							     to_request(fence),
							     hook,
							     I915_FENCE_GFP);
		else
			ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
							    I915_FENCE_TIMEOUT,
							    GFP_KERNEL);
		if (ret < 0)
			return ret;
	} while (--nchild);

	return 0;
}

/**
 * i915_request_await_object - set this request to (async) wait upon a bo
 * @to: request we are wishing to use
 * @obj: object which may be in use on another ring.
 * @write: whether the wait is on behalf of a writer
 *
 * This code is meant to abstract object synchronization with the GPU.
 * Conceptually we serialise writes between engines inside the GPU.
 * We only allow one engine to write into a buffer at any time, but
 * multiple readers. To ensure each has a coherent view of memory, we must:
 *
 * - If there is an outstanding write request to the object, the new
 *   request must wait for it to complete (either CPU or in hw, requests
 *   on the same ring will be naturally ordered).
 *
 * - If we are a write request (pending_write_domain is set), the new
 *   request must wait for outstanding read requests to complete.
 *
 * Returns 0 if successful, else propagates up the lower layer error.
 */
int
i915_request_await_object(struct i915_request *to,
			  struct drm_i915_gem_object *obj,
			  bool write)
{
	struct dma_fence *excl;
	int ret = 0;

	if (write) {
		struct dma_fence **shared;
		unsigned int count, i;

		ret = reservation_object_get_fences_rcu(obj->base.resv,
							&excl, &count, &shared);
		if (ret)
			return ret;

		for (i = 0; i < count; i++) {
			ret = i915_request_await_dma_fence(to, shared[i]);
			if (ret)
				break;

			dma_fence_put(shared[i]);
		}

		for (; i < count; i++)
			dma_fence_put(shared[i]);
		kfree(shared);
	} else {
		excl = reservation_object_get_excl_rcu(obj->base.resv);
	}

	if (excl) {
		if (ret == 0)
			ret = i915_request_await_dma_fence(to, excl);

		dma_fence_put(excl);
	}

	return ret;
}

void i915_request_skip(struct i915_request *rq, int error)
{
	void *vaddr = rq->ring->vaddr;
	u32 head;

	GEM_BUG_ON(!IS_ERR_VALUE((long)error));
	dma_fence_set_error(&rq->fence, error);

	/*
	 * As this request likely depends on state from the lost
	 * context, clear out all the user operations leaving the
	 * breadcrumb at the end (so we get the fence notifications).
	 */
	head = rq->infix;
	if (rq->postfix < head) {
		memset(vaddr + head, 0, rq->ring->size - head);
		head = 0;
	}
	memset(vaddr + head, 0, rq->postfix - head);
}

static struct i915_request *
__i915_request_add_to_timeline(struct i915_request *rq)
{
	struct i915_timeline *timeline = rq->timeline;
	struct i915_request *prev;

	/*
	 * Dependency tracking and request ordering along the timeline
	 * is special cased so that we can eliminate redundant ordering
	 * operations while building the request (we know that the timeline
	 * itself is ordered, and here we guarantee it).
	 *
	 * As we know we will need to emit tracking along the timeline,
	 * we embed the hooks into our request struct -- at the cost of
	 * having to have specialised no-allocation interfaces (which will
	 * be beneficial elsewhere).
	 *
	 * A second benefit to open-coding i915_request_await_request is
	 * that we can apply a slight variant of the rules specialised
	 * for timelines that jump between engines (such as virtual engines).
	 * If we consider the case of virtual engine, we must emit a dma-fence
	 * to prevent scheduling of the second request until the first is
	 * complete (to maximise our greedy late load balancing) and this
	 * precludes optimising to use semaphores serialisation of a single
	 * timeline across engines.
	 */
	prev = rcu_dereference_protected(timeline->last_request.request, 1);
	if (prev && !i915_request_completed(prev)) {
		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
			i915_sw_fence_await_sw_fence(&rq->submit,
						     &prev->submit,
						     &rq->submitq);
		else
			__i915_sw_fence_await_dma_fence(&rq->submit,
							&prev->fence,
							&rq->dmaq);
		if (rq->engine->schedule)
			__i915_sched_node_add_dependency(&rq->sched,
							 &prev->sched,
							 &rq->dep,
							 0);
	}

	list_add_tail(&rq->link, &timeline->requests);

	/*
	 * Make sure that no request gazumped us - if it was allocated after
	 * our i915_request_alloc() and called __i915_request_add() before
	 * us, the timeline will hold its seqno which is later than ours.
	 */
	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
	__i915_active_request_set(&timeline->last_request, rq);

	return prev;
}

/*
 * NB: This function is not allowed to fail. Doing so would mean the
 * request is not being tracked for completion but the work itself is
 * going to happen on the hardware. This would be a Bad Thing(tm).
 */
struct i915_request *__i915_request_commit(struct i915_request *rq)
{
	struct intel_engine_cs *engine = rq->engine;
	struct intel_ring *ring = rq->ring;
	struct i915_request *prev;
	u32 *cs;

	GEM_TRACE("%s fence %llx:%lld\n",
		  engine->name, rq->fence.context, rq->fence.seqno);

	/*
	 * To ensure that this call will not fail, space for its emissions
	 * should already have been reserved in the ring buffer. Let the ring
	 * know that it is time to use that space up.
	 */
	GEM_BUG_ON(rq->reserved_space > ring->space);
	rq->reserved_space = 0;

	/*
	 * Record the position of the start of the breadcrumb so that
	 * should we detect the updated seqno part-way through the
	 * GPU processing the request, we never over-estimate the
	 * position of the ring's HEAD.
	 */
	cs = intel_ring_begin(rq, engine->emit_fini_breadcrumb_dw);
	GEM_BUG_ON(IS_ERR(cs));
	rq->postfix = intel_ring_offset(rq, cs);

	prev = __i915_request_add_to_timeline(rq);

	list_add_tail(&rq->ring_link, &ring->request_list);
	if (list_is_first(&rq->ring_link, &ring->request_list))
		list_add(&ring->active_link, &rq->i915->gt.active_rings);
	rq->emitted_jiffies = jiffies;

	/*
	 * Let the backend know a new request has arrived that may need
	 * to adjust the existing execution schedule due to a high priority
	 * request - i.e. we may want to preempt the current request in order
	 * to run a high priority dependency chain *before* we can execute this
	 * request.
	 *
	 * This is called before the request is ready to run so that we can
	 * decide whether to preempt the entire chain so that it is ready to
	 * run at the earliest possible convenience.
	 */
	local_bh_disable();
	i915_sw_fence_commit(&rq->semaphore);
	rcu_read_lock(); /* RCU serialisation for set-wedged protection */
	if (engine->schedule) {
		struct i915_sched_attr attr = rq->gem_context->sched;

		/*
		 * Boost actual workloads past semaphores!
		 *
		 * With semaphores we spin on one engine waiting for another,
		 * simply to reduce the latency of starting our work when
		 * the signaler completes. However, if there is any other
		 * work that we could be doing on this engine instead, that
		 * is better utilisation and will reduce the overall duration
		 * of the current work. To avoid PI boosting a semaphore
		 * far in the distance past over useful work, we keep a history
		 * of any semaphore use along our dependency chain.
		 */
		if (!(rq->sched.flags & I915_SCHED_HAS_SEMAPHORE_CHAIN))
			attr.priority |= I915_PRIORITY_NOSEMAPHORE;

		/*
		 * Boost priorities to new clients (new request flows).
		 *
		 * Allow interactive/synchronous clients to jump ahead of
		 * the bulk clients.
		 * (FQ_CODEL)
		 */
		if (list_empty(&rq->sched.signalers_list))
			attr.priority |= I915_PRIORITY_WAIT;

		engine->schedule(rq, &attr);
	}
	rcu_read_unlock();
	i915_sw_fence_commit(&rq->submit);
	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */

	return prev;
}

void i915_request_add(struct i915_request *rq)
{
	struct i915_request *prev;

	lockdep_assert_held(&rq->timeline->mutex);
	lockdep_unpin_lock(&rq->timeline->mutex, rq->cookie);

	trace_i915_request_add(rq);

	prev = __i915_request_commit(rq);

	/*
	 * In typical scenarios, we do not expect the previous request on
	 * the timeline to be still tracked by timeline->last_request if it
	 * has been completed. If the completed request is still here, that
	 * implies that request retirement is a long way behind submission,
	 * suggesting that we haven't been retiring frequently enough from
	 * the combination of retire-before-alloc, waiters and the background
	 * retirement worker. So if the last request on this timeline was
	 * already completed, do a catch up pass, flushing the retirement queue
	 * up to this client. Since we have now moved the heaviest operations
	 * during retirement onto secondary workers, such as freeing objects
	 * or contexts, retiring a bunch of requests is mostly list management
	 * (and cache misses), and so we should not be overly penalizing this
	 * client by performing excess work, though we may still be performing
	 * work on behalf of others -- but instead we should benefit from
	 * improved resource management. (Well, that's the theory at least.)
	 */
	if (prev && i915_request_completed(prev))
		i915_request_retire_upto(prev);

	mutex_unlock(&rq->timeline->mutex);
}

static unsigned long local_clock_us(unsigned int *cpu)
{
	unsigned long t;

	/*
	 * Cheaply and approximately convert from nanoseconds to microseconds.
	 * The result and subsequent calculations are also defined in the same
	 * approximate microseconds units. The principal source of timing
	 * error here is from the simple truncation.
	 *
	 * Note that local_clock() is only defined wrt the current CPU;
	 * the comparisons are no longer valid if we switch CPUs. Instead of
	 * blocking preemption for the entire busywait, we can detect the CPU
	 * switch and use that as indicator of system load and a reason to
	 * stop busywaiting, see busywait_stop().
	 */
	*cpu = get_cpu();
	t = local_clock() >> 10;
	put_cpu();

	return t;
}

static bool busywait_stop(unsigned long timeout, unsigned int cpu)
{
	unsigned int this_cpu;

	if (time_after(local_clock_us(&this_cpu), timeout))
		return true;

	return this_cpu != cpu;
}

static bool __i915_spin_request(const struct i915_request * const rq,
				int state, unsigned long timeout_us)
{
	unsigned int cpu;

	/*
	 * Only wait for the request if we know it is likely to complete.
	 *
	 * We don't track the timestamps around requests, nor the average
	 * request length, so we do not have a good indicator that this
	 * request will complete within the timeout. What we do know is the
	 * order in which requests are executed by the context and so we can
	 * tell if the request has been started.
	 * If the request is not even running yet, it is a fair assumption
	 * that it will not complete within our relatively short timeout.
	 */
	if (!i915_request_is_running(rq))
		return false;

	/*
	 * When waiting for high frequency requests, e.g. during synchronous
	 * rendering split between the CPU and GPU, the finite amount of time
	 * required to set up the irq and wait upon it limits the response
	 * rate. By busywaiting on the request completion for a short while we
	 * can service the high frequency waits as quickly as possible.
	 * However, if it is a slow request, we want to sleep as quickly as
	 * possible. The tradeoff between waiting and sleeping is roughly the
	 * time it takes to sleep on a request, on the order of a microsecond.
	 */

	timeout_us += local_clock_us(&cpu);
	do {
		if (i915_request_completed(rq))
			return true;

		if (signal_pending_state(state, current))
			break;

		if (busywait_stop(timeout_us, cpu))
			break;

		cpu_relax();
	} while (!need_resched());

	return false;
}

struct request_wait {
	struct dma_fence_cb cb;
	struct task_struct *tsk;
};

static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb)
{
	struct request_wait *wait = container_of(cb, typeof(*wait), cb);

	wake_up_process(wait->tsk);
}

/**
 * i915_request_wait - wait until execution of request has finished
 * @rq: the request to wait upon
 * @flags: how to wait
 * @timeout: how long to wait in jiffies
 *
 * i915_request_wait() waits for the request to be completed, for a
 * maximum of @timeout jiffies (with MAX_SCHEDULE_TIMEOUT implying an
 * unbounded wait).
 *
 * Returns the remaining time (in jiffies) if the request completed, which may
 * be zero or -ETIME if the request is unfinished after the timeout expires.
 * May return -EINTR if called with I915_WAIT_INTERRUPTIBLE and a signal is
 * pending before the request completes.
 */
long i915_request_wait(struct i915_request *rq,
		       unsigned int flags,
		       long timeout)
{
	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
	struct request_wait wait;

	might_sleep();
	GEM_BUG_ON(timeout < 0);

	if (dma_fence_is_signaled(&rq->fence))
		return timeout;

	if (!timeout)
		return -ETIME;

	trace_i915_request_wait_begin(rq, flags);

	/*
	 * We must never wait on the GPU while holding a lock as we
	 * may need to perform a GPU reset. So while we don't need to
	 * serialise wait/reset with an explicit lock, we do want
	 * lockdep to detect potential dependency cycles.
	 */
	mutex_acquire(&rq->i915->gpu_error.wedge_mutex.dep_map,
		      0, 0, _THIS_IP_);

	/*
	 * Optimistic spin before touching IRQs.
	 *
	 * We may use a rather large value here to offset the penalty of
	 * switching away from the active task. Frequently, the client will
	 * wait upon an old swapbuffer to throttle itself to remain within a
	 * frame of the gpu. If the client is running in lockstep with the gpu,
	 * then it should not be waiting long at all, and a sleep now will incur
	 * extra scheduler latency in producing the next frame.
	 * To try to avoid adding the cost of enabling/disabling the
	 * interrupt to the short wait, we first spin to see if the request
	 * would have completed in the time taken to set up the interrupt.
	 *
	 * We need up to 5us to enable the irq, and up to 20us to hide the
	 * scheduler latency of a context switch, ignoring the secondary
	 * impacts from a context switch such as cache eviction.
	 *
	 * The scheme used for low-latency IO is called "hybrid interrupt
	 * polling". The suggestion there is to sleep until just before you
	 * expect to be woken by the device interrupt and then poll for its
	 * completion. That requires having a good predictor for the request
	 * duration, which we currently lack.
	 */
	if (CONFIG_DRM_I915_SPIN_REQUEST &&
	    __i915_spin_request(rq, state, CONFIG_DRM_I915_SPIN_REQUEST)) {
		dma_fence_signal(&rq->fence);
		goto out;
	}

	/*
	 * This client is about to stall waiting for the GPU. In many cases
	 * this is undesirable and limits the throughput of the system, as
	 * many clients cannot continue processing user input/output whilst
	 * blocked. RPS autotuning may take tens of milliseconds to respond
	 * to the GPU load and thus incurs additional latency for the client.
	 * We can circumvent that by promoting the GPU frequency to maximum
	 * before we sleep. This makes the GPU throttle up much more quickly
	 * (good for benchmarks and user experience, e.g. window animations),
	 * but at a cost of spending more power processing the workload
	 * (bad for battery).
	 */
	if (flags & I915_WAIT_PRIORITY) {
		if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
			gen6_rps_boost(rq);
		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
	}

	wait.tsk = current;
	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
		goto out;

	for (;;) {
		set_current_state(state);

		if (i915_request_completed(rq))
			break;

		if (signal_pending_state(state, current)) {
			timeout = -ERESTARTSYS;
			break;
		}

		if (!timeout) {
			timeout = -ETIME;
			break;
		}

		timeout = io_schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);

	dma_fence_remove_callback(&rq->fence, &wait.cb);

out:
	mutex_release(&rq->i915->gpu_error.wedge_mutex.dep_map, 0, _THIS_IP_);
	trace_i915_request_wait_end(rq);
	return timeout;
}

bool i915_retire_requests(struct drm_i915_private *i915)
{
	struct intel_ring *ring, *tmp;

	lockdep_assert_held(&i915->drm.struct_mutex);

	list_for_each_entry_safe(ring, tmp,
				 &i915->gt.active_rings, active_link) {
		intel_ring_get(ring); /* last rq holds reference! */
		ring_retire_requests(ring);
		intel_ring_put(ring);
	}

	return !list_empty(&i915->gt.active_rings);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"
#endif

static void i915_global_request_shrink(void)
{
	kmem_cache_shrink(global.slab_dependencies);
	kmem_cache_shrink(global.slab_execute_cbs);
	kmem_cache_shrink(global.slab_requests);
}

static void i915_global_request_exit(void)
{
	kmem_cache_destroy(global.slab_dependencies);
	kmem_cache_destroy(global.slab_execute_cbs);
	kmem_cache_destroy(global.slab_requests);
}

static struct i915_global_request global = { {
	.shrink = i915_global_request_shrink,
	.exit = i915_global_request_exit,
} };

int __init i915_global_request_init(void)
{
	global.slab_requests = KMEM_CACHE(i915_request,
					  SLAB_HWCACHE_ALIGN |
					  SLAB_RECLAIM_ACCOUNT |
					  SLAB_TYPESAFE_BY_RCU);
	if (!global.slab_requests)
		return -ENOMEM;

	global.slab_execute_cbs = KMEM_CACHE(execute_cb,
					     SLAB_HWCACHE_ALIGN |
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_TYPESAFE_BY_RCU);
	if (!global.slab_execute_cbs)
		goto err_requests;

	global.slab_dependencies = KMEM_CACHE(i915_dependency,
					      SLAB_HWCACHE_ALIGN |
					      SLAB_RECLAIM_ACCOUNT);
	if (!global.slab_dependencies)
		goto err_execute_cbs;

	i915_global_register(&global.base);
	return 0;

err_execute_cbs:
	kmem_cache_destroy(global.slab_execute_cbs);
err_requests:
	kmem_cache_destroy(global.slab_requests);
	return -ENOMEM;
}