/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need one set of those per engine command streamer? This
 * is where the name "Logical Rings" starts to make sense: by virtualizing
 * the rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use it. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
 * This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the context
 * switch flow by not doing preemption, but just sampling the new tail
 * pointer).
 *
 */
#include <linux/interrupt.h>

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/*
	 * And finally, which physical engines this virtual engine maps onto.
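	 *
	 * Note (editorial, inferred from the declaration that follows):
	 * siblings[] is a trailing variable-length array; num_siblings
	 * records how many entries were populated when the virtual engine
	 * was created, and the containing allocation is sized accordingly.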
*/ 219 unsigned int num_siblings; 220 struct intel_engine_cs *siblings[0]; 221 }; 222 223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 224 { 225 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 226 return container_of(engine, struct virtual_engine, base); 227 } 228 229 static int __execlists_context_alloc(struct intel_context *ce, 230 struct intel_engine_cs *engine); 231 232 static void execlists_init_reg_state(u32 *reg_state, 233 struct intel_context *ce, 234 struct intel_engine_cs *engine, 235 struct intel_ring *ring); 236 237 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 238 { 239 return (i915_ggtt_offset(engine->status_page.vma) + 240 I915_GEM_HWS_PREEMPT_ADDR); 241 } 242 243 static inline void 244 ring_set_paused(const struct intel_engine_cs *engine, int state) 245 { 246 /* 247 * We inspect HWS_PREEMPT with a semaphore inside 248 * engine->emit_fini_breadcrumb. If the dword is true, 249 * the ring is paused as the semaphore will busywait 250 * until the dword is false. 251 */ 252 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 253 if (state) 254 wmb(); 255 } 256 257 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 258 { 259 return rb_entry(rb, struct i915_priolist, node); 260 } 261 262 static inline int rq_prio(const struct i915_request *rq) 263 { 264 return rq->sched.attr.priority; 265 } 266 267 static int effective_prio(const struct i915_request *rq) 268 { 269 int prio = rq_prio(rq); 270 271 /* 272 * If this request is special and must not be interrupted at any 273 * cost, so be it. Note we are only checking the most recent request 274 * in the context and so may be masking an earlier vip request. It 275 * is hoped that under the conditions where nopreempt is used, this 276 * will not matter (i.e. all requests to that context will be 277 * nopreempt for as long as desired). 278 */ 279 if (i915_request_has_nopreempt(rq)) 280 prio = I915_PRIORITY_UNPREEMPTABLE; 281 282 /* 283 * On unwinding the active request, we give it a priority bump 284 * if it has completed waiting on any semaphore. If we know that 285 * the request has already started, we can prevent an unwanted 286 * preempt-to-idle cycle by taking that into account now. 287 */ 288 if (__i915_request_has_started(rq)) 289 prio |= I915_PRIORITY_NOSEMAPHORE; 290 291 /* Restrict mere WAIT boosts from triggering preemption */ 292 BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ 293 return prio | __NO_PREEMPTION; 294 } 295 296 static int queue_prio(const struct intel_engine_execlists *execlists) 297 { 298 struct i915_priolist *p; 299 struct rb_node *rb; 300 301 rb = rb_first_cached(&execlists->queue); 302 if (!rb) 303 return INT_MIN; 304 305 /* 306 * As the priolist[] are inverted, with the highest priority in [0], 307 * we have to flip the index value to become priority. 308 */ 309 p = to_priolist(rb); 310 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 311 } 312 313 static inline bool need_preempt(const struct intel_engine_cs *engine, 314 const struct i915_request *rq, 315 struct rb_node *rb) 316 { 317 int last_prio; 318 319 if (!intel_engine_has_semaphores(engine)) 320 return false; 321 322 /* 323 * Check if the current priority hint merits a preemption attempt. 
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 */
	last_prio = effective_prio(rq);
	if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint,
					 last_prio))
		return false;

	/*
	 * Check against the first request in ELSP[1]; thanks to the
	 * power of PI, it will be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
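 *
 * As a rough worked illustration (values invented for the example, and
 * assuming the GEN11_* shifts used below are 37, 48 and 61, matching the
 * layout above): a context with hw_id 3 on instance 1 of engine class 1
 * would have an upper descriptor dword of
 *
 *	(3 << (37 - 32)) | (1 << (48 - 32)) | (1 << (61 - 32)) == 0x20010060
 *
 * with the SW counter left at zero.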
421 */ 422 static u64 423 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 424 { 425 struct i915_gem_context *ctx = ce->gem_context; 426 u64 desc; 427 428 BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH))); 429 BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH))); 430 431 desc = INTEL_LEGACY_32B_CONTEXT; 432 if (i915_vm_is_4lvl(ce->vm)) 433 desc = INTEL_LEGACY_64B_CONTEXT; 434 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 435 436 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 437 if (IS_GEN(engine->i915, 8)) 438 desc |= GEN8_CTX_L3LLC_COHERENT; 439 440 desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE; 441 /* bits 12-31 */ 442 /* 443 * The following 32bits are copied into the OA reports (dword 2). 444 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing 445 * anything below. 446 */ 447 if (INTEL_GEN(engine->i915) >= 11) { 448 GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH)); 449 desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT; 450 /* bits 37-47 */ 451 452 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 453 /* bits 48-53 */ 454 455 /* TODO: decide what to do with SW counter (bits 55-60) */ 456 457 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 458 /* bits 61-63 */ 459 } else { 460 GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH)); 461 desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ 462 } 463 464 return desc; 465 } 466 467 static void unwind_wa_tail(struct i915_request *rq) 468 { 469 rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); 470 assert_ring_tail_valid(rq->ring, rq->tail); 471 } 472 473 static struct i915_request * 474 __unwind_incomplete_requests(struct intel_engine_cs *engine) 475 { 476 struct i915_request *rq, *rn, *active = NULL; 477 struct list_head *uninitialized_var(pl); 478 int prio = I915_PRIORITY_INVALID; 479 480 lockdep_assert_held(&engine->active.lock); 481 482 list_for_each_entry_safe_reverse(rq, rn, 483 &engine->active.requests, 484 sched.link) { 485 struct intel_engine_cs *owner; 486 487 if (i915_request_completed(rq)) 488 continue; /* XXX */ 489 490 __i915_request_unsubmit(rq); 491 unwind_wa_tail(rq); 492 493 /* 494 * Push the request back into the queue for later resubmission. 495 * If this request is not native to this physical engine (i.e. 496 * it came from a virtual source), push it back onto the virtual 497 * engine so that it can be moved across onto another physical 498 * engine as load dictates. 499 */ 500 owner = rq->hw_context->engine; 501 if (likely(owner == engine)) { 502 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 503 if (rq_prio(rq) != prio) { 504 prio = rq_prio(rq); 505 pl = i915_sched_lookup_priolist(engine, prio); 506 } 507 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 508 509 list_move(&rq->sched.link, pl); 510 active = rq; 511 } else { 512 /* 513 * Decouple the virtual breadcrumb before moving it 514 * back to the virtual engine -- we don't want the 515 * request to complete in the background and try 516 * and cancel the breadcrumb on the virtual engine 517 * (instead of the old engine where it is linked)! 
518 */ 519 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 520 &rq->fence.flags)) { 521 spin_lock(&rq->lock); 522 i915_request_cancel_breadcrumb(rq); 523 spin_unlock(&rq->lock); 524 } 525 rq->engine = owner; 526 owner->submit_request(rq); 527 active = NULL; 528 } 529 } 530 531 return active; 532 } 533 534 struct i915_request * 535 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 536 { 537 struct intel_engine_cs *engine = 538 container_of(execlists, typeof(*engine), execlists); 539 540 return __unwind_incomplete_requests(engine); 541 } 542 543 static inline void 544 execlists_context_status_change(struct i915_request *rq, unsigned long status) 545 { 546 /* 547 * Only used when GVT-g is enabled now. When GVT-g is disabled, 548 * The compiler should eliminate this function as dead-code. 549 */ 550 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 551 return; 552 553 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 554 status, rq); 555 } 556 557 static inline struct intel_engine_cs * 558 __execlists_schedule_in(struct i915_request *rq) 559 { 560 struct intel_engine_cs * const engine = rq->engine; 561 struct intel_context * const ce = rq->hw_context; 562 563 intel_context_get(ce); 564 565 intel_gt_pm_get(engine->gt); 566 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 567 intel_engine_context_in(engine); 568 569 return engine; 570 } 571 572 static inline struct i915_request * 573 execlists_schedule_in(struct i915_request *rq, int idx) 574 { 575 struct intel_context * const ce = rq->hw_context; 576 struct intel_engine_cs *old; 577 578 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 579 trace_i915_request_in(rq, idx); 580 581 old = READ_ONCE(ce->inflight); 582 do { 583 if (!old) { 584 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 585 break; 586 } 587 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 588 589 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 590 return i915_request_get(rq); 591 } 592 593 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 594 { 595 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 596 struct i915_request *next = READ_ONCE(ve->request); 597 598 if (next && next->execution_mask & ~rq->execution_mask) 599 tasklet_schedule(&ve->base.execlists.tasklet); 600 } 601 602 static inline void 603 __execlists_schedule_out(struct i915_request *rq, 604 struct intel_engine_cs * const engine) 605 { 606 struct intel_context * const ce = rq->hw_context; 607 608 intel_engine_context_out(engine); 609 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 610 intel_gt_pm_put(engine->gt); 611 612 /* 613 * If this is part of a virtual engine, its next request may 614 * have been blocked waiting for access to the active context. 615 * We have to kick all the siblings again in case we need to 616 * switch (e.g. the next request is not runnable on this 617 * engine). Hopefully, we will already have submitted the next 618 * request before the tasklet runs and do not need to rebuild 619 * each virtual tree and kick everyone again. 
620 */ 621 if (ce->engine != engine) 622 kick_siblings(rq, ce); 623 624 intel_context_put(ce); 625 } 626 627 static inline void 628 execlists_schedule_out(struct i915_request *rq) 629 { 630 struct intel_context * const ce = rq->hw_context; 631 struct intel_engine_cs *cur, *old; 632 633 trace_i915_request_out(rq); 634 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 635 636 old = READ_ONCE(ce->inflight); 637 do 638 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 639 while (!try_cmpxchg(&ce->inflight, &old, cur)); 640 if (!cur) 641 __execlists_schedule_out(rq, old); 642 643 i915_request_put(rq); 644 } 645 646 static u64 execlists_update_context(const struct i915_request *rq) 647 { 648 struct intel_context *ce = rq->hw_context; 649 u64 desc; 650 651 ce->lrc_reg_state[CTX_RING_TAIL + 1] = 652 intel_ring_set_tail(rq->ring, rq->tail); 653 654 /* 655 * Make sure the context image is complete before we submit it to HW. 656 * 657 * Ostensibly, writes (including the WCB) should be flushed prior to 658 * an uncached write such as our mmio register access, the empirical 659 * evidence (esp. on Braswell) suggests that the WC write into memory 660 * may not be visible to the HW prior to the completion of the UC 661 * register write and that we may begin execution from the context 662 * before its image is complete leading to invalid PD chasing. 663 * 664 * Furthermore, Braswell, at least, wants a full mb to be sure that 665 * the writes are coherent in memory (visible to the GPU) prior to 666 * execution, and not just visible to other CPUs (as is the result of 667 * wmb). 668 */ 669 mb(); 670 671 desc = ce->lrc_desc; 672 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 673 674 return desc; 675 } 676 677 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 678 { 679 if (execlists->ctrl_reg) { 680 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 681 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 682 } else { 683 writel(upper_32_bits(desc), execlists->submit_reg); 684 writel(lower_32_bits(desc), execlists->submit_reg); 685 } 686 } 687 688 static __maybe_unused void 689 trace_ports(const struct intel_engine_execlists *execlists, 690 const char *msg, 691 struct i915_request * const *ports) 692 { 693 const struct intel_engine_cs *engine = 694 container_of(execlists, typeof(*engine), execlists); 695 696 GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n", 697 engine->name, msg, 698 ports[0]->fence.context, 699 ports[0]->fence.seqno, 700 i915_request_completed(ports[0]) ? "!" : 701 i915_request_started(ports[0]) ? "*" : 702 "", 703 ports[1] ? ports[1]->fence.context : 0, 704 ports[1] ? 
ports[1]->fence.seqno : 0); 705 } 706 707 static __maybe_unused bool 708 assert_pending_valid(const struct intel_engine_execlists *execlists, 709 const char *msg) 710 { 711 struct i915_request * const *port, *rq; 712 struct intel_context *ce = NULL; 713 714 trace_ports(execlists, msg, execlists->pending); 715 716 if (!execlists->pending[0]) 717 return false; 718 719 if (execlists->pending[execlists_num_ports(execlists)]) 720 return false; 721 722 for (port = execlists->pending; (rq = *port); port++) { 723 if (ce == rq->hw_context) 724 return false; 725 726 ce = rq->hw_context; 727 if (i915_request_completed(rq)) 728 continue; 729 730 if (i915_active_is_idle(&ce->active)) 731 return false; 732 733 if (!i915_vma_is_pinned(ce->state)) 734 return false; 735 } 736 737 return ce; 738 } 739 740 static void execlists_submit_ports(struct intel_engine_cs *engine) 741 { 742 struct intel_engine_execlists *execlists = &engine->execlists; 743 unsigned int n; 744 745 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 746 747 /* 748 * We can skip acquiring intel_runtime_pm_get() here as it was taken 749 * on our behalf by the request (see i915_gem_mark_busy()) and it will 750 * not be relinquished until the device is idle (see 751 * i915_gem_idle_work_handler()). As a precaution, we make sure 752 * that all ELSP are drained i.e. we have processed the CSB, 753 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 754 */ 755 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 756 757 /* 758 * ELSQ note: the submit queue is not cleared after being submitted 759 * to the HW so we need to make sure we always clean it up. This is 760 * currently ensured by the fact that we always write the same number 761 * of elsq entries, keep this in mind before changing the loop below. 762 */ 763 for (n = execlists_num_ports(execlists); n--; ) { 764 struct i915_request *rq = execlists->pending[n]; 765 766 write_desc(execlists, 767 rq ? execlists_update_context(rq) : 0, 768 n); 769 } 770 771 /* we need to manually load the submit queue */ 772 if (execlists->ctrl_reg) 773 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 774 } 775 776 static bool ctx_single_port_submission(const struct intel_context *ce) 777 { 778 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 779 i915_gem_context_force_single_submission(ce->gem_context)); 780 } 781 782 static bool can_merge_ctx(const struct intel_context *prev, 783 const struct intel_context *next) 784 { 785 if (prev != next) 786 return false; 787 788 if (ctx_single_port_submission(prev)) 789 return false; 790 791 return true; 792 } 793 794 static bool can_merge_rq(const struct i915_request *prev, 795 const struct i915_request *next) 796 { 797 GEM_BUG_ON(prev == next); 798 GEM_BUG_ON(!assert_priority_queue(prev, next)); 799 800 if (!can_merge_ctx(prev->hw_context, next->hw_context)) 801 return false; 802 803 return true; 804 } 805 806 static void virtual_update_register_offsets(u32 *regs, 807 struct intel_engine_cs *engine) 808 { 809 u32 base = engine->mmio_base; 810 811 /* Must match execlists_init_reg_state()! 
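	 *
	 * (Editorial note: each offset written below refers to a register in
	 * this engine's mmio_base range; whenever execlists_init_reg_state()
	 * adds a register to the context image, a matching fixup has to be
	 * added here, otherwise a migrated virtual context would keep
	 * pointing at the previous sibling's registers.)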
*/ 812 813 regs[CTX_CONTEXT_CONTROL] = 814 i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base)); 815 regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base)); 816 regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base)); 817 regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base)); 818 regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base)); 819 820 regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base)); 821 regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base)); 822 regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base)); 823 regs[CTX_SECOND_BB_HEAD_U] = 824 i915_mmio_reg_offset(RING_SBBADDR_UDW(base)); 825 regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base)); 826 regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base)); 827 828 regs[CTX_CTX_TIMESTAMP] = 829 i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base)); 830 regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3)); 831 regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3)); 832 regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2)); 833 regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2)); 834 regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1)); 835 regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1)); 836 regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0)); 837 regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0)); 838 839 if (engine->class == RENDER_CLASS) { 840 regs[CTX_RCS_INDIRECT_CTX] = 841 i915_mmio_reg_offset(RING_INDIRECT_CTX(base)); 842 regs[CTX_RCS_INDIRECT_CTX_OFFSET] = 843 i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base)); 844 regs[CTX_BB_PER_CTX_PTR] = 845 i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base)); 846 847 regs[CTX_R_PWR_CLK_STATE] = 848 i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE); 849 } 850 } 851 852 static bool virtual_matches(const struct virtual_engine *ve, 853 const struct i915_request *rq, 854 const struct intel_engine_cs *engine) 855 { 856 const struct intel_engine_cs *inflight; 857 858 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 859 return false; 860 861 /* 862 * We track when the HW has completed saving the context image 863 * (i.e. when we have seen the final CS event switching out of 864 * the context) and must not overwrite the context image before 865 * then. This restricts us to only using the active engine 866 * while the previous virtualized request is inflight (so 867 * we reuse the register offsets). This is a very small 868 * hystersis on the greedy seelction algorithm. 
869 */ 870 inflight = intel_context_inflight(&ve->context); 871 if (inflight && inflight != engine) 872 return false; 873 874 return true; 875 } 876 877 static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, 878 struct intel_engine_cs *engine) 879 { 880 struct intel_engine_cs *old = ve->siblings[0]; 881 882 /* All unattached (rq->engine == old) must already be completed */ 883 884 spin_lock(&old->breadcrumbs.irq_lock); 885 if (!list_empty(&ve->context.signal_link)) { 886 list_move_tail(&ve->context.signal_link, 887 &engine->breadcrumbs.signalers); 888 intel_engine_queue_breadcrumbs(engine); 889 } 890 spin_unlock(&old->breadcrumbs.irq_lock); 891 } 892 893 static struct i915_request * 894 last_active(const struct intel_engine_execlists *execlists) 895 { 896 struct i915_request * const *last = execlists->active; 897 898 while (*last && i915_request_completed(*last)) 899 last++; 900 901 return *last; 902 } 903 904 static void defer_request(struct i915_request *rq, struct list_head * const pl) 905 { 906 LIST_HEAD(list); 907 908 /* 909 * We want to move the interrupted request to the back of 910 * the round-robin list (i.e. its priority level), but 911 * in doing so, we must then move all requests that were in 912 * flight and were waiting for the interrupted request to 913 * be run after it again. 914 */ 915 do { 916 struct i915_dependency *p; 917 918 GEM_BUG_ON(i915_request_is_active(rq)); 919 list_move_tail(&rq->sched.link, pl); 920 921 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) { 922 struct i915_request *w = 923 container_of(p->waiter, typeof(*w), sched); 924 925 /* Leave semaphores spinning on the other engines */ 926 if (w->engine != rq->engine) 927 continue; 928 929 /* No waiter should start before its signaler */ 930 GEM_BUG_ON(i915_request_started(w) && 931 !i915_request_completed(rq)); 932 933 GEM_BUG_ON(i915_request_is_active(w)); 934 if (list_empty(&w->sched.link)) 935 continue; /* Not yet submitted; unready */ 936 937 if (rq_prio(w) < rq_prio(rq)) 938 continue; 939 940 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 941 list_move_tail(&w->sched.link, &list); 942 } 943 944 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 945 } while (rq); 946 } 947 948 static void defer_active(struct intel_engine_cs *engine) 949 { 950 struct i915_request *rq; 951 952 rq = __unwind_incomplete_requests(engine); 953 if (!rq) 954 return; 955 956 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 957 } 958 959 static bool 960 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 961 { 962 int hint; 963 964 if (!intel_engine_has_semaphores(engine)) 965 return false; 966 967 if (list_is_last(&rq->sched.link, &engine->active.requests)) 968 return false; 969 970 hint = max(rq_prio(list_next_entry(rq, sched.link)), 971 engine->execlists.queue_priority_hint); 972 973 return hint >= effective_prio(rq); 974 } 975 976 static int 977 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 978 { 979 if (list_is_last(&rq->sched.link, &engine->active.requests)) 980 return INT_MIN; 981 982 return rq_prio(list_next_entry(rq, sched.link)); 983 } 984 985 static bool 986 enable_timeslice(const struct intel_engine_execlists *execlists) 987 { 988 const struct i915_request *rq = *execlists->active; 989 990 if (i915_request_completed(rq)) 991 return false; 992 993 return execlists->switch_priority_hint >= effective_prio(rq); 994 } 995 996 static void record_preemption(struct intel_engine_execlists *execlists) 997 { 998 
(void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 999 } 1000 1001 static void execlists_dequeue(struct intel_engine_cs *engine) 1002 { 1003 struct intel_engine_execlists * const execlists = &engine->execlists; 1004 struct i915_request **port = execlists->pending; 1005 struct i915_request ** const last_port = port + execlists->port_mask; 1006 struct i915_request *last; 1007 struct rb_node *rb; 1008 bool submit = false; 1009 1010 /* 1011 * Hardware submission is through 2 ports. Conceptually each port 1012 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1013 * static for a context, and unique to each, so we only execute 1014 * requests belonging to a single context from each ring. RING_HEAD 1015 * is maintained by the CS in the context image, it marks the place 1016 * where it got up to last time, and through RING_TAIL we tell the CS 1017 * where we want to execute up to this time. 1018 * 1019 * In this list the requests are in order of execution. Consecutive 1020 * requests from the same context are adjacent in the ringbuffer. We 1021 * can combine these requests into a single RING_TAIL update: 1022 * 1023 * RING_HEAD...req1...req2 1024 * ^- RING_TAIL 1025 * since to execute req2 the CS must first execute req1. 1026 * 1027 * Our goal then is to point each port to the end of a consecutive 1028 * sequence of requests as being the most optimal (fewest wake ups 1029 * and context switches) submission. 1030 */ 1031 1032 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1033 struct virtual_engine *ve = 1034 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1035 struct i915_request *rq = READ_ONCE(ve->request); 1036 1037 if (!rq) { /* lazily cleanup after another engine handled rq */ 1038 rb_erase_cached(rb, &execlists->virtual); 1039 RB_CLEAR_NODE(rb); 1040 rb = rb_first_cached(&execlists->virtual); 1041 continue; 1042 } 1043 1044 if (!virtual_matches(ve, rq, engine)) { 1045 rb = rb_next(rb); 1046 continue; 1047 } 1048 1049 break; 1050 } 1051 1052 /* 1053 * If the queue is higher priority than the last 1054 * request in the currently active context, submit afresh. 1055 * We will resubmit again afterwards in case we need to split 1056 * the active context to interject the preemption request, 1057 * i.e. we will retrigger preemption following the ack in case 1058 * of trouble. 1059 */ 1060 last = last_active(execlists); 1061 if (last) { 1062 if (need_preempt(engine, last, rb)) { 1063 GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n", 1064 engine->name, 1065 last->fence.context, 1066 last->fence.seqno, 1067 last->sched.attr.priority, 1068 execlists->queue_priority_hint); 1069 record_preemption(execlists); 1070 1071 /* 1072 * Don't let the RING_HEAD advance past the breadcrumb 1073 * as we unwind (and until we resubmit) so that we do 1074 * not accidentally tell it to go backwards. 1075 */ 1076 ring_set_paused(engine, 1); 1077 1078 /* 1079 * Note that we have not stopped the GPU at this point, 1080 * so we are unwinding the incomplete requests as they 1081 * remain inflight and so by the time we do complete 1082 * the preemption, some of the unwound requests may 1083 * complete! 1084 */ 1085 __unwind_incomplete_requests(engine); 1086 1087 /* 1088 * If we need to return to the preempted context, we 1089 * need to skip the lite-restore and force it to 1090 * reload the RING_TAIL. Otherwise, the HW has a 1091 * tendency to ignore us rewinding the TAIL to the 1092 * end of an earlier request. 
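			 *
			 * (Hence CTX_DESC_FORCE_RESTORE being set on the
			 * descriptor just below; execlists_update_context()
			 * clears the bit again once the descriptor has been
			 * read for submission to the ELSP.)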
1093 */ 1094 last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1095 last = NULL; 1096 } else if (need_timeslice(engine, last) && 1097 !timer_pending(&engine->execlists.timer)) { 1098 GEM_TRACE("%s: expired last=%llx:%lld, prio=%d, hint=%d\n", 1099 engine->name, 1100 last->fence.context, 1101 last->fence.seqno, 1102 last->sched.attr.priority, 1103 execlists->queue_priority_hint); 1104 1105 ring_set_paused(engine, 1); 1106 defer_active(engine); 1107 1108 /* 1109 * Unlike for preemption, if we rewind and continue 1110 * executing the same context as previously active, 1111 * the order of execution will remain the same and 1112 * the tail will only advance. We do not need to 1113 * force a full context restore, as a lite-restore 1114 * is sufficient to resample the monotonic TAIL. 1115 * 1116 * If we switch to any other context, similarly we 1117 * will not rewind TAIL of current context, and 1118 * normal save/restore will preserve state and allow 1119 * us to later continue executing the same request. 1120 */ 1121 last = NULL; 1122 } else { 1123 /* 1124 * Otherwise if we already have a request pending 1125 * for execution after the current one, we can 1126 * just wait until the next CS event before 1127 * queuing more. In either case we will force a 1128 * lite-restore preemption event, but if we wait 1129 * we hopefully coalesce several updates into a single 1130 * submission. 1131 */ 1132 if (!list_is_last(&last->sched.link, 1133 &engine->active.requests)) 1134 return; 1135 1136 /* 1137 * WaIdleLiteRestore:bdw,skl 1138 * Apply the wa NOOPs to prevent 1139 * ring:HEAD == rq:TAIL as we resubmit the 1140 * request. See gen8_emit_fini_breadcrumb() for 1141 * where we prepare the padding after the 1142 * end of the request. 1143 */ 1144 last->tail = last->wa_tail; 1145 } 1146 } 1147 1148 while (rb) { /* XXX virtual is always taking precedence */ 1149 struct virtual_engine *ve = 1150 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1151 struct i915_request *rq; 1152 1153 spin_lock(&ve->base.active.lock); 1154 1155 rq = ve->request; 1156 if (unlikely(!rq)) { /* lost the race to a sibling */ 1157 spin_unlock(&ve->base.active.lock); 1158 rb_erase_cached(rb, &execlists->virtual); 1159 RB_CLEAR_NODE(rb); 1160 rb = rb_first_cached(&execlists->virtual); 1161 continue; 1162 } 1163 1164 GEM_BUG_ON(rq != ve->request); 1165 GEM_BUG_ON(rq->engine != &ve->base); 1166 GEM_BUG_ON(rq->hw_context != &ve->context); 1167 1168 if (rq_prio(rq) >= queue_prio(execlists)) { 1169 if (!virtual_matches(ve, rq, engine)) { 1170 spin_unlock(&ve->base.active.lock); 1171 rb = rb_next(rb); 1172 continue; 1173 } 1174 1175 if (i915_request_completed(rq)) { 1176 ve->request = NULL; 1177 ve->base.execlists.queue_priority_hint = INT_MIN; 1178 rb_erase_cached(rb, &execlists->virtual); 1179 RB_CLEAR_NODE(rb); 1180 1181 rq->engine = engine; 1182 __i915_request_submit(rq); 1183 1184 spin_unlock(&ve->base.active.lock); 1185 1186 rb = rb_first_cached(&execlists->virtual); 1187 continue; 1188 } 1189 1190 if (last && !can_merge_rq(last, rq)) { 1191 spin_unlock(&ve->base.active.lock); 1192 return; /* leave this for another */ 1193 } 1194 1195 GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", 1196 engine->name, 1197 rq->fence.context, 1198 rq->fence.seqno, 1199 i915_request_completed(rq) ? "!" : 1200 i915_request_started(rq) ? 
"*" : 1201 "", 1202 yesno(engine != ve->siblings[0])); 1203 1204 ve->request = NULL; 1205 ve->base.execlists.queue_priority_hint = INT_MIN; 1206 rb_erase_cached(rb, &execlists->virtual); 1207 RB_CLEAR_NODE(rb); 1208 1209 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1210 rq->engine = engine; 1211 1212 if (engine != ve->siblings[0]) { 1213 u32 *regs = ve->context.lrc_reg_state; 1214 unsigned int n; 1215 1216 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1217 virtual_update_register_offsets(regs, engine); 1218 1219 if (!list_empty(&ve->context.signals)) 1220 virtual_xfer_breadcrumbs(ve, engine); 1221 1222 /* 1223 * Move the bound engine to the top of the list 1224 * for future execution. We then kick this 1225 * tasklet first before checking others, so that 1226 * we preferentially reuse this set of bound 1227 * registers. 1228 */ 1229 for (n = 1; n < ve->num_siblings; n++) { 1230 if (ve->siblings[n] == engine) { 1231 swap(ve->siblings[n], 1232 ve->siblings[0]); 1233 break; 1234 } 1235 } 1236 1237 GEM_BUG_ON(ve->siblings[0] != engine); 1238 } 1239 1240 __i915_request_submit(rq); 1241 if (!i915_request_completed(rq)) { 1242 submit = true; 1243 last = rq; 1244 } 1245 } 1246 1247 spin_unlock(&ve->base.active.lock); 1248 break; 1249 } 1250 1251 while ((rb = rb_first_cached(&execlists->queue))) { 1252 struct i915_priolist *p = to_priolist(rb); 1253 struct i915_request *rq, *rn; 1254 int i; 1255 1256 priolist_for_each_request_consume(rq, rn, p, i) { 1257 if (i915_request_completed(rq)) 1258 goto skip; 1259 1260 /* 1261 * Can we combine this request with the current port? 1262 * It has to be the same context/ringbuffer and not 1263 * have any exceptions (e.g. GVT saying never to 1264 * combine contexts). 1265 * 1266 * If we can combine the requests, we can execute both 1267 * by updating the RING_TAIL to point to the end of the 1268 * second request, and so we never need to tell the 1269 * hardware about the first. 1270 */ 1271 if (last && !can_merge_rq(last, rq)) { 1272 /* 1273 * If we are on the second port and cannot 1274 * combine this request with the last, then we 1275 * are done. 1276 */ 1277 if (port == last_port) 1278 goto done; 1279 1280 /* 1281 * We must not populate both ELSP[] with the 1282 * same LRCA, i.e. we must submit 2 different 1283 * contexts if we submit 2 ELSP. 1284 */ 1285 if (last->hw_context == rq->hw_context) 1286 goto done; 1287 1288 /* 1289 * If GVT overrides us we only ever submit 1290 * port[0], leaving port[1] empty. Note that we 1291 * also have to be careful that we don't queue 1292 * the same context (even though a different 1293 * request) to the second port. 1294 */ 1295 if (ctx_single_port_submission(last->hw_context) || 1296 ctx_single_port_submission(rq->hw_context)) 1297 goto done; 1298 1299 *port = execlists_schedule_in(last, port - execlists->pending); 1300 port++; 1301 } 1302 1303 last = rq; 1304 submit = true; 1305 skip: 1306 __i915_request_submit(rq); 1307 } 1308 1309 rb_erase_cached(&p->node, &execlists->queue); 1310 i915_priolist_free(p); 1311 } 1312 1313 done: 1314 /* 1315 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 1316 * 1317 * We choose the priority hint such that if we add a request of greater 1318 * priority than this, we kick the submission tasklet to decide on 1319 * the right order of submitting the requests to hardware. We must 1320 * also be prepared to reorder requests as they are in-flight on the 1321 * HW. 
We derive the priority hint then as the first "hole" in 1322 * the HW submission ports and if there are no available slots, 1323 * the priority of the lowest executing request, i.e. last. 1324 * 1325 * When we do receive a higher priority request ready to run from the 1326 * user, see queue_request(), the priority hint is bumped to that 1327 * request triggering preemption on the next dequeue (or subsequent 1328 * interrupt for secondary ports). 1329 */ 1330 execlists->queue_priority_hint = queue_prio(execlists); 1331 GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n", 1332 engine->name, execlists->queue_priority_hint, 1333 yesno(submit)); 1334 1335 if (submit) { 1336 *port = execlists_schedule_in(last, port - execlists->pending); 1337 memset(port + 1, 0, (last_port - port) * sizeof(*port)); 1338 execlists->switch_priority_hint = 1339 switch_prio(engine, *execlists->pending); 1340 execlists_submit_ports(engine); 1341 } else { 1342 ring_set_paused(engine, 0); 1343 } 1344 } 1345 1346 static void 1347 cancel_port_requests(struct intel_engine_execlists * const execlists) 1348 { 1349 struct i915_request * const *port, *rq; 1350 1351 for (port = execlists->pending; (rq = *port); port++) 1352 execlists_schedule_out(rq); 1353 memset(execlists->pending, 0, sizeof(execlists->pending)); 1354 1355 for (port = execlists->active; (rq = *port); port++) 1356 execlists_schedule_out(rq); 1357 execlists->active = 1358 memset(execlists->inflight, 0, sizeof(execlists->inflight)); 1359 } 1360 1361 static inline void 1362 invalidate_csb_entries(const u32 *first, const u32 *last) 1363 { 1364 clflush((void *)first); 1365 clflush((void *)last); 1366 } 1367 1368 static inline bool 1369 reset_in_progress(const struct intel_engine_execlists *execlists) 1370 { 1371 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1372 } 1373 1374 enum csb_step { 1375 CSB_NOP, 1376 CSB_PROMOTE, 1377 CSB_PREEMPT, 1378 CSB_COMPLETE, 1379 }; 1380 1381 /* 1382 * Starting with Gen12, the status has a new format: 1383 * 1384 * bit 0: switched to new queue 1385 * bit 1: reserved 1386 * bit 2: semaphore wait mode (poll or signal), only valid when 1387 * switch detail is set to "wait on semaphore" 1388 * bits 3-5: engine class 1389 * bits 6-11: engine instance 1390 * bits 12-14: reserved 1391 * bits 15-25: sw context id of the lrc the GT switched to 1392 * bits 26-31: sw counter of the lrc the GT switched to 1393 * bits 32-35: context switch detail 1394 * - 0: ctx complete 1395 * - 1: wait on sync flip 1396 * - 2: wait on vblank 1397 * - 3: wait on scanline 1398 * - 4: wait on semaphore 1399 * - 5: context preempted (not on SEMAPHORE_WAIT or 1400 * WAIT_FOR_EVENT) 1401 * bit 36: reserved 1402 * bits 37-43: wait detail (for switch detail 1 to 4) 1403 * bits 44-46: reserved 1404 * bits 47-57: sw context id of the lrc the GT switched away from 1405 * bits 58-63: sw counter of the lrc the GT switched away from 1406 */ 1407 static inline enum csb_step 1408 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 1409 { 1410 u32 lower_dw = csb[0]; 1411 u32 upper_dw = csb[1]; 1412 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 1413 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 1414 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 1415 1416 if (!ctx_away_valid && ctx_to_valid) 1417 return CSB_PROMOTE; 1418 1419 /* 1420 * The context switch detail is not guaranteed to be 5 when a preemption 1421 * occurs, so we can't just check for that. 
The check below works for 1422 * all the cases we care about, including preemptions of WAIT 1423 * instructions and lite-restore. Preempt-to-idle via the CTRL register 1424 * would require some extra handling, but we don't support that. 1425 */ 1426 if (new_queue && ctx_away_valid) 1427 return CSB_PREEMPT; 1428 1429 /* 1430 * switch detail = 5 is covered by the case above and we do not expect a 1431 * context switch on an unsuccessful wait instruction since we always 1432 * use polling mode. 1433 */ 1434 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 1435 1436 if (*execlists->active) { 1437 GEM_BUG_ON(!ctx_away_valid); 1438 return CSB_COMPLETE; 1439 } 1440 1441 return CSB_NOP; 1442 } 1443 1444 static inline enum csb_step 1445 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 1446 { 1447 unsigned int status = *csb; 1448 1449 if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) 1450 return CSB_PROMOTE; 1451 1452 if (status & GEN8_CTX_STATUS_PREEMPTED) 1453 return CSB_PREEMPT; 1454 1455 if (*execlists->active) 1456 return CSB_COMPLETE; 1457 1458 return CSB_NOP; 1459 } 1460 1461 static void process_csb(struct intel_engine_cs *engine) 1462 { 1463 struct intel_engine_execlists * const execlists = &engine->execlists; 1464 const u32 * const buf = execlists->csb_status; 1465 const u8 num_entries = execlists->csb_size; 1466 u8 head, tail; 1467 1468 GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915)); 1469 1470 /* 1471 * Note that csb_write, csb_status may be either in HWSP or mmio. 1472 * When reading from the csb_write mmio register, we have to be 1473 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 1474 * the low 4bits. As it happens we know the next 4bits are always 1475 * zero and so we can simply masked off the low u8 of the register 1476 * and treat it identically to reading from the HWSP (without having 1477 * to use explicit shifting and masking, and probably bifurcating 1478 * the code to handle the legacy mmio read). 1479 */ 1480 head = execlists->csb_head; 1481 tail = READ_ONCE(*execlists->csb_write); 1482 GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail); 1483 if (unlikely(head == tail)) 1484 return; 1485 1486 /* 1487 * Hopefully paired with a wmb() in HW! 1488 * 1489 * We must complete the read of the write pointer before any reads 1490 * from the CSB, so that we do not see stale values. Without an rmb 1491 * (lfence) the HW may speculatively perform the CSB[] reads *before* 1492 * we perform the READ_ONCE(*csb_write). 1493 */ 1494 rmb(); 1495 1496 do { 1497 enum csb_step csb_step; 1498 1499 if (++head == num_entries) 1500 head = 0; 1501 1502 /* 1503 * We are flying near dragons again. 1504 * 1505 * We hold a reference to the request in execlist_port[] 1506 * but no more than that. We are operating in softirq 1507 * context and so cannot hold any mutex or sleep. That 1508 * prevents us stopping the requests we are processing 1509 * in port[] from being retired simultaneously (the 1510 * breadcrumb will be complete before we see the 1511 * context-switch). As we only hold the reference to the 1512 * request, any pointer chasing underneath the request 1513 * is subject to a potential use-after-free. Thus we 1514 * store all of the bookkeeping within port[] as 1515 * required, and avoid using unguarded pointers beneath 1516 * request itself. The same applies to the atomic 1517 * status notifier. 
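		 *
		 * (Editorial note on the walk below: the CSB is a small ring
		 * of csb_size entries, each two dwords wide; head is our read
		 * pointer, tail the snapshot of the HW write pointer taken
		 * above, and entries are consumed modulo num_entries until
		 * the two meet.)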
1518 */ 1519 1520 GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", 1521 engine->name, head, 1522 buf[2 * head + 0], buf[2 * head + 1]); 1523 1524 if (INTEL_GEN(engine->i915) >= 12) 1525 csb_step = gen12_csb_parse(execlists, buf + 2 * head); 1526 else 1527 csb_step = gen8_csb_parse(execlists, buf + 2 * head); 1528 1529 switch (csb_step) { 1530 case CSB_PREEMPT: /* cancel old inflight, prepare for switch */ 1531 trace_ports(execlists, "preempted", execlists->active); 1532 1533 while (*execlists->active) 1534 execlists_schedule_out(*execlists->active++); 1535 1536 /* fallthrough */ 1537 case CSB_PROMOTE: /* switch pending to inflight */ 1538 GEM_BUG_ON(*execlists->active); 1539 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 1540 execlists->active = 1541 memcpy(execlists->inflight, 1542 execlists->pending, 1543 execlists_num_ports(execlists) * 1544 sizeof(*execlists->pending)); 1545 1546 if (enable_timeslice(execlists)) 1547 mod_timer(&execlists->timer, jiffies + 1); 1548 1549 if (!inject_preempt_hang(execlists)) 1550 ring_set_paused(engine, 0); 1551 1552 WRITE_ONCE(execlists->pending[0], NULL); 1553 break; 1554 1555 case CSB_COMPLETE: /* port0 completed, advanced to port1 */ 1556 trace_ports(execlists, "completed", execlists->active); 1557 1558 /* 1559 * We rely on the hardware being strongly 1560 * ordered, that the breadcrumb write is 1561 * coherent (visible from the CPU) before the 1562 * user interrupt and CSB is processed. 1563 */ 1564 GEM_BUG_ON(!i915_request_completed(*execlists->active) && 1565 !reset_in_progress(execlists)); 1566 execlists_schedule_out(*execlists->active++); 1567 1568 GEM_BUG_ON(execlists->active - execlists->inflight > 1569 execlists_num_ports(execlists)); 1570 break; 1571 1572 case CSB_NOP: 1573 break; 1574 } 1575 } while (head != tail); 1576 1577 execlists->csb_head = head; 1578 1579 /* 1580 * Gen11 has proven to fail wrt global observation point between 1581 * entry and tail update, failing on the ordering and thus 1582 * we see an old entry in the context status buffer. 1583 * 1584 * Forcibly evict out entries for the next gpu csb update, 1585 * to increase the odds that we get a fresh entries with non 1586 * working hardware. The cost for doing so comes out mostly with 1587 * the wash as hardware, working or not, will need to do the 1588 * invalidation before. 1589 */ 1590 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 1591 } 1592 1593 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 1594 { 1595 lockdep_assert_held(&engine->active.lock); 1596 if (!engine->execlists.pending[0]) 1597 execlists_dequeue(engine); 1598 } 1599 1600 /* 1601 * Check the unread Context Status Buffers and manage the submission of new 1602 * contexts to the ELSP accordingly. 
1603 */ 1604 static void execlists_submission_tasklet(unsigned long data) 1605 { 1606 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 1607 unsigned long flags; 1608 1609 process_csb(engine); 1610 if (!READ_ONCE(engine->execlists.pending[0])) { 1611 spin_lock_irqsave(&engine->active.lock, flags); 1612 __execlists_submission_tasklet(engine); 1613 spin_unlock_irqrestore(&engine->active.lock, flags); 1614 } 1615 } 1616 1617 static void execlists_submission_timer(struct timer_list *timer) 1618 { 1619 struct intel_engine_cs *engine = 1620 from_timer(engine, timer, execlists.timer); 1621 1622 /* Kick the tasklet for some interrupt coalescing and reset handling */ 1623 tasklet_hi_schedule(&engine->execlists.tasklet); 1624 } 1625 1626 static void queue_request(struct intel_engine_cs *engine, 1627 struct i915_sched_node *node, 1628 int prio) 1629 { 1630 GEM_BUG_ON(!list_empty(&node->link)); 1631 list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); 1632 } 1633 1634 static void __submit_queue_imm(struct intel_engine_cs *engine) 1635 { 1636 struct intel_engine_execlists * const execlists = &engine->execlists; 1637 1638 if (reset_in_progress(execlists)) 1639 return; /* defer until we restart the engine following reset */ 1640 1641 if (execlists->tasklet.func == execlists_submission_tasklet) 1642 __execlists_submission_tasklet(engine); 1643 else 1644 tasklet_hi_schedule(&execlists->tasklet); 1645 } 1646 1647 static void submit_queue(struct intel_engine_cs *engine, 1648 const struct i915_request *rq) 1649 { 1650 struct intel_engine_execlists *execlists = &engine->execlists; 1651 1652 if (rq_prio(rq) <= execlists->queue_priority_hint) 1653 return; 1654 1655 execlists->queue_priority_hint = rq_prio(rq); 1656 __submit_queue_imm(engine); 1657 } 1658 1659 static void execlists_submit_request(struct i915_request *request) 1660 { 1661 struct intel_engine_cs *engine = request->engine; 1662 unsigned long flags; 1663 1664 /* Will be called from irq-context when using foreign fences. 
*/ 1665 spin_lock_irqsave(&engine->active.lock, flags); 1666 1667 queue_request(engine, &request->sched, rq_prio(request)); 1668 1669 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1670 GEM_BUG_ON(list_empty(&request->sched.link)); 1671 1672 submit_queue(engine, request); 1673 1674 spin_unlock_irqrestore(&engine->active.lock, flags); 1675 } 1676 1677 static void __execlists_context_fini(struct intel_context *ce) 1678 { 1679 intel_ring_put(ce->ring); 1680 i915_vma_put(ce->state); 1681 } 1682 1683 static void execlists_context_destroy(struct kref *kref) 1684 { 1685 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 1686 1687 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 1688 GEM_BUG_ON(intel_context_is_pinned(ce)); 1689 1690 if (ce->state) 1691 __execlists_context_fini(ce); 1692 1693 intel_context_fini(ce); 1694 intel_context_free(ce); 1695 } 1696 1697 static void 1698 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 1699 { 1700 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1701 return; 1702 1703 vaddr += LRC_HEADER_PAGES * PAGE_SIZE; 1704 vaddr += engine->context_size; 1705 1706 memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); 1707 } 1708 1709 static void 1710 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 1711 { 1712 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1713 return; 1714 1715 vaddr += LRC_HEADER_PAGES * PAGE_SIZE; 1716 vaddr += engine->context_size; 1717 1718 if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) 1719 dev_err_once(engine->i915->drm.dev, 1720 "%s context redzone overwritten!\n", 1721 engine->name); 1722 } 1723 1724 static void execlists_context_unpin(struct intel_context *ce) 1725 { 1726 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 1727 ce->engine); 1728 1729 i915_gem_context_unpin_hw_id(ce->gem_context); 1730 i915_gem_object_unpin_map(ce->state->obj); 1731 intel_ring_reset(ce->ring, ce->ring->tail); 1732 } 1733 1734 static void 1735 __execlists_update_reg_state(struct intel_context *ce, 1736 struct intel_engine_cs *engine) 1737 { 1738 struct intel_ring *ring = ce->ring; 1739 u32 *regs = ce->lrc_reg_state; 1740 1741 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); 1742 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 1743 1744 regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma); 1745 regs[CTX_RING_HEAD + 1] = ring->head; 1746 regs[CTX_RING_TAIL + 1] = ring->tail; 1747 1748 /* RPCS */ 1749 if (engine->class == RENDER_CLASS) { 1750 regs[CTX_R_PWR_CLK_STATE + 1] = 1751 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 1752 1753 i915_oa_init_reg_state(engine, ce, regs); 1754 } 1755 } 1756 1757 static int 1758 __execlists_context_pin(struct intel_context *ce, 1759 struct intel_engine_cs *engine) 1760 { 1761 void *vaddr; 1762 int ret; 1763 1764 GEM_BUG_ON(!ce->state); 1765 1766 ret = intel_context_active_acquire(ce); 1767 if (ret) 1768 goto err; 1769 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 1770 1771 vaddr = i915_gem_object_pin_map(ce->state->obj, 1772 i915_coherent_map_type(engine->i915) | 1773 I915_MAP_OVERRIDE); 1774 if (IS_ERR(vaddr)) { 1775 ret = PTR_ERR(vaddr); 1776 goto unpin_active; 1777 } 1778 1779 ret = i915_gem_context_pin_hw_id(ce->gem_context); 1780 if (ret) 1781 goto unpin_map; 1782 1783 ce->lrc_desc = lrc_descriptor(ce, engine); 1784 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 1785 __execlists_update_reg_state(ce, engine); 1786 1787 return 0; 1788 1789 unpin_map: 1790 i915_gem_object_unpin_map(ce->state->obj); 1791 unpin_active: 1792 
intel_context_active_release(ce); 1793 err: 1794 return ret; 1795 } 1796 1797 static int execlists_context_pin(struct intel_context *ce) 1798 { 1799 return __execlists_context_pin(ce, ce->engine); 1800 } 1801 1802 static int execlists_context_alloc(struct intel_context *ce) 1803 { 1804 return __execlists_context_alloc(ce, ce->engine); 1805 } 1806 1807 static void execlists_context_reset(struct intel_context *ce) 1808 { 1809 /* 1810 * Because we emit WA_TAIL_DWORDS there may be a disparity 1811 * between our bookkeeping in ce->ring->head and ce->ring->tail and 1812 * that stored in context. As we only write new commands from 1813 * ce->ring->tail onwards, everything before that is junk. If the GPU 1814 * starts reading from its RING_HEAD from the context, it may try to 1815 * execute that junk and die. 1816 * 1817 * The contexts that are stilled pinned on resume belong to the 1818 * kernel, and are local to each engine. All other contexts will 1819 * have their head/tail sanitized upon pinning before use, so they 1820 * will never see garbage, 1821 * 1822 * So to avoid that we reset the context images upon resume. For 1823 * simplicity, we just zero everything out. 1824 */ 1825 intel_ring_reset(ce->ring, 0); 1826 __execlists_update_reg_state(ce, ce->engine); 1827 } 1828 1829 static const struct intel_context_ops execlists_context_ops = { 1830 .alloc = execlists_context_alloc, 1831 1832 .pin = execlists_context_pin, 1833 .unpin = execlists_context_unpin, 1834 1835 .enter = intel_context_enter_engine, 1836 .exit = intel_context_exit_engine, 1837 1838 .reset = execlists_context_reset, 1839 .destroy = execlists_context_destroy, 1840 }; 1841 1842 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 1843 { 1844 u32 *cs; 1845 1846 GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb); 1847 1848 cs = intel_ring_begin(rq, 6); 1849 if (IS_ERR(cs)) 1850 return PTR_ERR(cs); 1851 1852 /* 1853 * Check if we have been preempted before we even get started. 1854 * 1855 * After this point i915_request_started() reports true, even if 1856 * we get preempted and so are no longer running. 1857 */ 1858 *cs++ = MI_ARB_CHECK; 1859 *cs++ = MI_NOOP; 1860 1861 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 1862 *cs++ = rq->timeline->hwsp_offset; 1863 *cs++ = 0; 1864 *cs++ = rq->fence.seqno - 1; 1865 1866 intel_ring_advance(rq, cs); 1867 1868 /* Record the updated position of the request's payload */ 1869 rq->infix = intel_ring_offset(rq, cs); 1870 1871 return 0; 1872 } 1873 1874 static int emit_pdps(struct i915_request *rq) 1875 { 1876 const struct intel_engine_cs * const engine = rq->engine; 1877 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->hw_context->vm); 1878 int err, i; 1879 u32 *cs; 1880 1881 GEM_BUG_ON(intel_vgpu_active(rq->i915)); 1882 1883 /* 1884 * Beware ye of the dragons, this sequence is magic! 1885 * 1886 * Small changes to this sequence can cause anything from 1887 * GPU hangs to forcewake errors and machine lockups! 1888 */ 1889 1890 /* Flush any residual operations from the context load */ 1891 err = engine->emit_flush(rq, EMIT_FLUSH); 1892 if (err) 1893 return err; 1894 1895 /* Magic required to prevent forcewake errors! 
*/ 1896 err = engine->emit_flush(rq, EMIT_INVALIDATE); 1897 if (err) 1898 return err; 1899 1900 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 1901 if (IS_ERR(cs)) 1902 return PTR_ERR(cs); 1903 1904 /* Ensure the LRI have landed before we invalidate & continue */ 1905 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 1906 for (i = GEN8_3LVL_PDPES; i--; ) { 1907 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 1908 u32 base = engine->mmio_base; 1909 1910 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 1911 *cs++ = upper_32_bits(pd_daddr); 1912 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 1913 *cs++ = lower_32_bits(pd_daddr); 1914 } 1915 *cs++ = MI_NOOP; 1916 1917 intel_ring_advance(rq, cs); 1918 1919 /* Be doubly sure the LRI have landed before proceeding */ 1920 err = engine->emit_flush(rq, EMIT_FLUSH); 1921 if (err) 1922 return err; 1923 1924 /* Re-invalidate the TLB for luck */ 1925 return engine->emit_flush(rq, EMIT_INVALIDATE); 1926 } 1927 1928 static int execlists_request_alloc(struct i915_request *request) 1929 { 1930 int ret; 1931 1932 GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); 1933 1934 /* 1935 * Flush enough space to reduce the likelihood of waiting after 1936 * we start building the request - in which case we will just 1937 * have to repeat work. 1938 */ 1939 request->reserved_space += EXECLISTS_REQUEST_SIZE; 1940 1941 /* 1942 * Note that after this point, we have committed to using 1943 * this request as it is being used to both track the 1944 * state of engine initialisation and liveness of the 1945 * golden renderstate above. Think twice before you try 1946 * to cancel/unwind this request now. 1947 */ 1948 1949 /* Unconditionally invalidate GPU caches and TLBs. */ 1950 if (i915_vm_is_4lvl(request->hw_context->vm)) 1951 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 1952 else 1953 ret = emit_pdps(request); 1954 if (ret) 1955 return ret; 1956 1957 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 1958 return 0; 1959 } 1960 1961 /* 1962 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 1963 * PIPE_CONTROL instruction. This is required for the flush to happen correctly, 1964 * but there is a slight complication as this is applied in a WA batch where the 1965 * values are only initialized once, so we cannot take the register value at the 1966 * beginning and reuse it further; hence we save its value to memory, upload a 1967 * constant value with bit21 set and then restore it back with the saved value. 1968 * To simplify the WA, a constant value is formed by using the default value 1969 * of this register. This shouldn't be a problem because we are only modifying 1970 * it for a short period and this batch is non-preemptible. We could of course 1971 * use additional instructions that read the actual value of the register 1972 * at that time and set our bit of interest, but that makes the WA more complicated. 1973 * 1974 * This WA is also required for Gen9, so extracting it as a function avoids 1975 * code duplication. 1976 */ 1977 static u32 * 1978 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 1979 { 1980 /* NB no one else is allowed to scribble over scratch + 256!
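 *
 * The instruction sequence below stores the live GEN8_L3SQCREG4 value to
 * the COHERENTL3_WA scratch slot, loads the constant with the flush bit
 * set, emits the stalling pipe control, and finally restores the saved
 * value from scratch.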
*/ 1981 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1982 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1983 *batch++ = intel_gt_scratch_offset(engine->gt, 1984 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 1985 *batch++ = 0; 1986 1987 *batch++ = MI_LOAD_REGISTER_IMM(1); 1988 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1989 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 1990 1991 batch = gen8_emit_pipe_control(batch, 1992 PIPE_CONTROL_CS_STALL | 1993 PIPE_CONTROL_DC_FLUSH_ENABLE, 1994 0); 1995 1996 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 1997 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 1998 *batch++ = intel_gt_scratch_offset(engine->gt, 1999 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 2000 *batch++ = 0; 2001 2002 return batch; 2003 } 2004 2005 static u32 slm_offset(struct intel_engine_cs *engine) 2006 { 2007 return intel_gt_scratch_offset(engine->gt, 2008 INTEL_GT_SCRATCH_FIELD_CLEAR_SLM_WA); 2009 } 2010 2011 /* 2012 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 2013 * initialized at the beginning and shared across all contexts but this field 2014 * helps us to have multiple batches at different offsets and select them based 2015 * on some criteria. At the moment this batch always starts at the beginning of the page 2016 * and at this point we don't have multiple wa_ctx batch buffers. 2017 * 2018 * The number of WAs applied is not known at the beginning; we use this field 2019 * to return the number of DWORDs written. 2020 * 2021 * Note that this batch does not contain MI_BATCH_BUFFER_END, 2022 * so it adds NOOPs as padding to make it cacheline aligned. 2023 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together 2024 * make a complete batch buffer.
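 *
 * Each gen*_init_indirectctx_bb() helper below returns the advanced batch
 * pointer; intel_init_workaround_bb() then derives the offset and size of
 * each workaround batch from the pointer difference.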
2025 */ 2026 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2027 { 2028 /* WaDisableCtxRestoreArbitration:bdw,chv */ 2029 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2030 2031 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 2032 if (IS_BROADWELL(engine->i915)) 2033 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2034 2035 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 2036 /* Actual scratch location is at 128 bytes offset */ 2037 batch = gen8_emit_pipe_control(batch, 2038 PIPE_CONTROL_FLUSH_L3 | 2039 PIPE_CONTROL_GLOBAL_GTT_IVB | 2040 PIPE_CONTROL_CS_STALL | 2041 PIPE_CONTROL_QW_WRITE, 2042 slm_offset(engine)); 2043 2044 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2045 2046 /* Pad to end of cacheline */ 2047 while ((unsigned long)batch % CACHELINE_BYTES) 2048 *batch++ = MI_NOOP; 2049 2050 /* 2051 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 2052 * execution depends on the length specified in terms of cache lines 2053 * in the register CTX_RCS_INDIRECT_CTX 2054 */ 2055 2056 return batch; 2057 } 2058 2059 struct lri { 2060 i915_reg_t reg; 2061 u32 value; 2062 }; 2063 2064 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 2065 { 2066 GEM_BUG_ON(!count || count > 63); 2067 2068 *batch++ = MI_LOAD_REGISTER_IMM(count); 2069 do { 2070 *batch++ = i915_mmio_reg_offset(lri->reg); 2071 *batch++ = lri->value; 2072 } while (lri++, --count); 2073 *batch++ = MI_NOOP; 2074 2075 return batch; 2076 } 2077 2078 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2079 { 2080 static const struct lri lri[] = { 2081 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 2082 { 2083 COMMON_SLICE_CHICKEN2, 2084 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 2085 0), 2086 }, 2087 2088 /* BSpec: 11391 */ 2089 { 2090 FF_SLICE_CHICKEN, 2091 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 2092 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 2093 }, 2094 2095 /* BSpec: 11299 */ 2096 { 2097 _3D_CHICKEN3, 2098 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 2099 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 2100 } 2101 }; 2102 2103 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2104 2105 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 2106 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 2107 2108 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 2109 2110 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 2111 if (HAS_POOLED_EU(engine->i915)) { 2112 /* 2113 * EU pool configuration is setup along with golden context 2114 * during context initialization. This value depends on 2115 * device type (2x6 or 3x6) and needs to be updated based 2116 * on which subslice is disabled especially for 2x6 2117 * devices, however it is safe to load default 2118 * configuration of 3x6 device instead of masking off 2119 * corresponding bits because HW ignores bits of a disabled 2120 * subslice and drops down to appropriate config. Please 2121 * see render_state_setup() in i915_gem_render_state.c for 2122 * possible configurations, to avoid duplication they are 2123 * not shown here again. 
2124 */ 2125 *batch++ = GEN9_MEDIA_POOL_STATE; 2126 *batch++ = GEN9_MEDIA_POOL_ENABLE; 2127 *batch++ = 0x00777000; 2128 *batch++ = 0; 2129 *batch++ = 0; 2130 *batch++ = 0; 2131 } 2132 2133 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2134 2135 /* Pad to end of cacheline */ 2136 while ((unsigned long)batch % CACHELINE_BYTES) 2137 *batch++ = MI_NOOP; 2138 2139 return batch; 2140 } 2141 2142 static u32 * 2143 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 2144 { 2145 int i; 2146 2147 /* 2148 * WaPipeControlBefore3DStateSamplePattern: cnl 2149 * 2150 * Ensure the engine is idle prior to programming a 2151 * 3DSTATE_SAMPLE_PATTERN during a context restore. 2152 */ 2153 batch = gen8_emit_pipe_control(batch, 2154 PIPE_CONTROL_CS_STALL, 2155 0); 2156 /* 2157 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 2158 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 2159 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 2160 * confusing. Since gen8_emit_pipe_control() already advances the 2161 * batch by 6 dwords, we advance the other 10 here, completing a 2162 * cacheline. It's not clear if the workaround requires this padding 2163 * before other commands, or if it's just the regular padding we would 2164 * already have for the workaround bb, so leave it here for now. 2165 */ 2166 for (i = 0; i < 10; i++) 2167 *batch++ = MI_NOOP; 2168 2169 /* Pad to end of cacheline */ 2170 while ((unsigned long)batch % CACHELINE_BYTES) 2171 *batch++ = MI_NOOP; 2172 2173 return batch; 2174 } 2175 2176 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 2177 2178 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 2179 { 2180 struct drm_i915_gem_object *obj; 2181 struct i915_vma *vma; 2182 int err; 2183 2184 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 2185 if (IS_ERR(obj)) 2186 return PTR_ERR(obj); 2187 2188 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 2189 if (IS_ERR(vma)) { 2190 err = PTR_ERR(vma); 2191 goto err; 2192 } 2193 2194 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 2195 if (err) 2196 goto err; 2197 2198 engine->wa_ctx.vma = vma; 2199 return 0; 2200 2201 err: 2202 i915_gem_object_put(obj); 2203 return err; 2204 } 2205 2206 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 2207 { 2208 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 2209 } 2210 2211 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 2212 2213 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 2214 { 2215 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 2216 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 2217 &wa_ctx->per_ctx }; 2218 wa_bb_func_t wa_bb_fn[2]; 2219 struct page *page; 2220 void *batch, *batch_ptr; 2221 unsigned int i; 2222 int ret; 2223 2224 if (engine->class != RENDER_CLASS) 2225 return 0; 2226 2227 switch (INTEL_GEN(engine->i915)) { 2228 case 12: 2229 case 11: 2230 return 0; 2231 case 10: 2232 wa_bb_fn[0] = gen10_init_indirectctx_bb; 2233 wa_bb_fn[1] = NULL; 2234 break; 2235 case 9: 2236 wa_bb_fn[0] = gen9_init_indirectctx_bb; 2237 wa_bb_fn[1] = NULL; 2238 break; 2239 case 8: 2240 wa_bb_fn[0] = gen8_init_indirectctx_bb; 2241 wa_bb_fn[1] = NULL; 2242 break; 2243 default: 2244 MISSING_CASE(INTEL_GEN(engine->i915)); 2245 return 0; 2246 } 2247 2248 ret = lrc_setup_wa_ctx(engine); 2249 if (ret) { 2250 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 2251 return ret; 2252 } 2253 2254 page = 
i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 2255 batch = batch_ptr = kmap_atomic(page); 2256 2257 /* 2258 * Emit the two workaround batch buffers, recording the offset from the 2259 * start of the workaround batch buffer object for each and their 2260 * respective sizes. 2261 */ 2262 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 2263 wa_bb[i]->offset = batch_ptr - batch; 2264 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 2265 CACHELINE_BYTES))) { 2266 ret = -EINVAL; 2267 break; 2268 } 2269 if (wa_bb_fn[i]) 2270 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 2271 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 2272 } 2273 2274 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 2275 2276 kunmap_atomic(batch); 2277 if (ret) 2278 lrc_destroy_wa_ctx(engine); 2279 2280 return ret; 2281 } 2282 2283 static void enable_execlists(struct intel_engine_cs *engine) 2284 { 2285 u32 mode; 2286 2287 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 2288 2289 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 2290 2291 if (INTEL_GEN(engine->i915) >= 11) 2292 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 2293 else 2294 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 2295 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 2296 2297 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 2298 2299 ENGINE_WRITE_FW(engine, 2300 RING_HWS_PGA, 2301 i915_ggtt_offset(engine->status_page.vma)); 2302 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 2303 } 2304 2305 static bool unexpected_starting_state(struct intel_engine_cs *engine) 2306 { 2307 bool unexpected = false; 2308 2309 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 2310 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 2311 unexpected = true; 2312 } 2313 2314 return unexpected; 2315 } 2316 2317 static int execlists_resume(struct intel_engine_cs *engine) 2318 { 2319 intel_engine_apply_workarounds(engine); 2320 intel_engine_apply_whitelist(engine); 2321 2322 intel_mocs_init_engine(engine); 2323 2324 intel_engine_reset_breadcrumbs(engine); 2325 2326 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 2327 struct drm_printer p = drm_debug_printer(__func__); 2328 2329 intel_engine_dump(engine, &p, NULL); 2330 } 2331 2332 enable_execlists(engine); 2333 2334 return 0; 2335 } 2336 2337 static void execlists_reset_prepare(struct intel_engine_cs *engine) 2338 { 2339 struct intel_engine_execlists * const execlists = &engine->execlists; 2340 unsigned long flags; 2341 2342 GEM_TRACE("%s: depth<-%d\n", engine->name, 2343 atomic_read(&execlists->tasklet.count)); 2344 2345 /* 2346 * Prevent request submission to the hardware until we have 2347 * completed the reset in i915_gem_reset_finish(). If a request 2348 * is completed by one engine, it may then queue a request 2349 * to a second via its execlists->tasklet *just* as we are 2350 * calling engine->resume() and also writing the ELSP. 2351 * Turning off the execlists->tasklet until the reset is over 2352 * prevents the race. 2353 */ 2354 __tasklet_disable_sync_once(&execlists->tasklet); 2355 GEM_BUG_ON(!reset_in_progress(execlists)); 2356 2357 /* And flush any current direct submission. */ 2358 spin_lock_irqsave(&engine->active.lock, flags); 2359 spin_unlock_irqrestore(&engine->active.lock, flags); 2360 2361 /* 2362 * We stop engines, otherwise we might get failed reset and a 2363 * dead gpu (on elk). 
Also as modern gpu as kbl can suffer 2364 * from system hang if batchbuffer is progressing when 2365 * the reset is issued, regardless of READY_TO_RESET ack. 2366 * Thus assume it is best to stop engines on all gens 2367 * where we have a gpu reset. 2368 * 2369 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 2370 * 2371 * FIXME: Wa for more modern gens needs to be validated 2372 */ 2373 intel_engine_stop_cs(engine); 2374 } 2375 2376 static void reset_csb_pointers(struct intel_engine_cs *engine) 2377 { 2378 struct intel_engine_execlists * const execlists = &engine->execlists; 2379 const unsigned int reset_value = execlists->csb_size - 1; 2380 2381 ring_set_paused(engine, 0); 2382 2383 /* 2384 * After a reset, the HW starts writing into CSB entry [0]. We 2385 * therefore have to set our HEAD pointer back one entry so that 2386 * the *first* entry we check is entry 0. To complicate this further, 2387 * as we don't wait for the first interrupt after reset, we have to 2388 * fake the HW write to point back to the last entry so that our 2389 * inline comparison of our cached head position against the last HW 2390 * write works even before the first interrupt. 2391 */ 2392 execlists->csb_head = reset_value; 2393 WRITE_ONCE(*execlists->csb_write, reset_value); 2394 wmb(); /* Make sure this is visible to HW (paranoia?) */ 2395 2396 invalidate_csb_entries(&execlists->csb_status[0], 2397 &execlists->csb_status[reset_value]); 2398 } 2399 2400 static struct i915_request *active_request(struct i915_request *rq) 2401 { 2402 const struct list_head * const list = &rq->timeline->requests; 2403 const struct intel_context * const ce = rq->hw_context; 2404 struct i915_request *active = NULL; 2405 2406 list_for_each_entry_from_reverse(rq, list, link) { 2407 if (i915_request_completed(rq)) 2408 break; 2409 2410 if (rq->hw_context != ce) 2411 break; 2412 2413 active = rq; 2414 } 2415 2416 return active; 2417 } 2418 2419 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 2420 { 2421 struct intel_engine_execlists * const execlists = &engine->execlists; 2422 struct intel_context *ce; 2423 struct i915_request *rq; 2424 u32 *regs; 2425 2426 process_csb(engine); /* drain preemption events */ 2427 2428 /* Following the reset, we need to reload the CSB read/write pointers */ 2429 reset_csb_pointers(engine); 2430 2431 /* 2432 * Save the currently executing context, even if we completed 2433 * its request, it was still running at the time of the 2434 * reset and will have been clobbered. 2435 */ 2436 rq = execlists_active(execlists); 2437 if (!rq) 2438 goto unwind; 2439 2440 ce = rq->hw_context; 2441 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 2442 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2443 rq = active_request(rq); 2444 if (!rq) { 2445 ce->ring->head = ce->ring->tail; 2446 goto out_replay; 2447 } 2448 2449 ce->ring->head = intel_ring_wrap(ce->ring, rq->head); 2450 2451 /* 2452 * If this request hasn't started yet, e.g. it is waiting on a 2453 * semaphore, we need to avoid skipping the request or else we 2454 * break the signaling chain. However, if the context is corrupt 2455 * the request will not restart and we will be stuck with a wedged 2456 * device. It is quite often the case that if we issue a reset 2457 * while the GPU is loading the context image, that the context 2458 * image becomes corrupt. 2459 * 2460 * Otherwise, if we have not started yet, the request should replay 2461 * perfectly and we do not need to flag the result as being erroneous. 
2462 */ 2463 if (!i915_request_started(rq)) 2464 goto out_replay; 2465 2466 /* 2467 * If the request was innocent, we leave the request in the ELSP 2468 * and will try to replay it on restarting. The context image may 2469 * have been corrupted by the reset, in which case we may have 2470 * to service a new GPU hang, but more likely we can continue on 2471 * without impact. 2472 * 2473 * If the request was guilty, we presume the context is corrupt 2474 * and have to at least restore the RING register in the context 2475 * image back to the expected values to skip over the guilty request. 2476 */ 2477 __i915_request_reset(rq, stalled); 2478 if (!stalled) 2479 goto out_replay; 2480 2481 /* 2482 * We want a simple context + ring to execute the breadcrumb update. 2483 * We cannot rely on the context being intact across the GPU hang, 2484 * so clear it and rebuild just what we need for the breadcrumb. 2485 * All pending requests for this context will be zapped, and any 2486 * future request will be after userspace has had the opportunity 2487 * to recreate its own state. 2488 */ 2489 regs = ce->lrc_reg_state; 2490 if (engine->pinned_default_state) { 2491 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 2492 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 2493 engine->context_size - PAGE_SIZE); 2494 } 2495 execlists_init_reg_state(regs, ce, engine, ce->ring); 2496 2497 out_replay: 2498 GEM_TRACE("%s replay {head:%04x, tail:%04x\n", 2499 engine->name, ce->ring->head, ce->ring->tail); 2500 intel_ring_update_space(ce->ring); 2501 __execlists_update_reg_state(ce, engine); 2502 2503 unwind: 2504 /* Push back any incomplete requests for replay after the reset. */ 2505 cancel_port_requests(execlists); 2506 __unwind_incomplete_requests(engine); 2507 } 2508 2509 static void execlists_reset(struct intel_engine_cs *engine, bool stalled) 2510 { 2511 unsigned long flags; 2512 2513 GEM_TRACE("%s\n", engine->name); 2514 2515 spin_lock_irqsave(&engine->active.lock, flags); 2516 2517 __execlists_reset(engine, stalled); 2518 2519 spin_unlock_irqrestore(&engine->active.lock, flags); 2520 } 2521 2522 static void nop_submission_tasklet(unsigned long data) 2523 { 2524 /* The driver is wedged; don't process any more events. */ 2525 } 2526 2527 static void execlists_cancel_requests(struct intel_engine_cs *engine) 2528 { 2529 struct intel_engine_execlists * const execlists = &engine->execlists; 2530 struct i915_request *rq, *rn; 2531 struct rb_node *rb; 2532 unsigned long flags; 2533 2534 GEM_TRACE("%s\n", engine->name); 2535 2536 /* 2537 * Before we call engine->cancel_requests(), we should have exclusive 2538 * access to the submission state. This is arranged for us by the 2539 * caller disabling the interrupt generation, the tasklet and other 2540 * threads that may then access the same state, giving us a free hand 2541 * to reset state. However, we still need to let lockdep be aware that 2542 * we know this state may be accessed in hardirq context, so we 2543 * disable the irq around this manipulation and we want to keep 2544 * the spinlock focused on its duties and not accidentally conflate 2545 * coverage to the submission's irq state. (Similarly, although we 2546 * shouldn't need to disable irq around the manipulation of the 2547 * submission's irq state, we also wish to remind ourselves that 2548 * it is irq state.) 2549 */ 2550 spin_lock_irqsave(&engine->active.lock, flags); 2551 2552 __execlists_reset(engine, true); 2553 2554 /* Mark all executing requests as skipped. 
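 *
 * Every request still on engine->active.requests is marked complete and,
 * unless already signaled, has -EIO set on its fence, so that waiters are
 * released without expecting the wedged GPU to make further progress.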
*/ 2555 list_for_each_entry(rq, &engine->active.requests, sched.link) { 2556 if (!i915_request_signaled(rq)) 2557 dma_fence_set_error(&rq->fence, -EIO); 2558 2559 i915_request_mark_complete(rq); 2560 } 2561 2562 /* Flush the queued requests to the timeline list (for retiring). */ 2563 while ((rb = rb_first_cached(&execlists->queue))) { 2564 struct i915_priolist *p = to_priolist(rb); 2565 int i; 2566 2567 priolist_for_each_request_consume(rq, rn, p, i) { 2568 list_del_init(&rq->sched.link); 2569 __i915_request_submit(rq); 2570 dma_fence_set_error(&rq->fence, -EIO); 2571 i915_request_mark_complete(rq); 2572 } 2573 2574 rb_erase_cached(&p->node, &execlists->queue); 2575 i915_priolist_free(p); 2576 } 2577 2578 /* Cancel all attached virtual engines */ 2579 while ((rb = rb_first_cached(&execlists->virtual))) { 2580 struct virtual_engine *ve = 2581 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2582 2583 rb_erase_cached(rb, &execlists->virtual); 2584 RB_CLEAR_NODE(rb); 2585 2586 spin_lock(&ve->base.active.lock); 2587 if (ve->request) { 2588 ve->request->engine = engine; 2589 __i915_request_submit(ve->request); 2590 dma_fence_set_error(&ve->request->fence, -EIO); 2591 i915_request_mark_complete(ve->request); 2592 ve->base.execlists.queue_priority_hint = INT_MIN; 2593 ve->request = NULL; 2594 } 2595 spin_unlock(&ve->base.active.lock); 2596 } 2597 2598 /* Remaining _unready_ requests will be nop'ed when submitted */ 2599 2600 execlists->queue_priority_hint = INT_MIN; 2601 execlists->queue = RB_ROOT_CACHED; 2602 2603 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 2604 execlists->tasklet.func = nop_submission_tasklet; 2605 2606 spin_unlock_irqrestore(&engine->active.lock, flags); 2607 } 2608 2609 static void execlists_reset_finish(struct intel_engine_cs *engine) 2610 { 2611 struct intel_engine_execlists * const execlists = &engine->execlists; 2612 2613 /* 2614 * After a GPU reset, we may have requests to replay. Do so now while 2615 * we still have the forcewake to be sure that the GPU is not allowed 2616 * to sleep before we restart and reload a context. 2617 */ 2618 GEM_BUG_ON(!reset_in_progress(execlists)); 2619 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 2620 execlists->tasklet.func(execlists->tasklet.data); 2621 2622 if (__tasklet_enable(&execlists->tasklet)) 2623 /* And kick in case we missed a new request submission. */ 2624 tasklet_hi_schedule(&execlists->tasklet); 2625 GEM_TRACE("%s: depth->%d\n", engine->name, 2626 atomic_read(&execlists->tasklet.count)); 2627 } 2628 2629 static int gen8_emit_bb_start(struct i915_request *rq, 2630 u64 offset, u32 len, 2631 const unsigned int flags) 2632 { 2633 u32 *cs; 2634 2635 cs = intel_ring_begin(rq, 4); 2636 if (IS_ERR(cs)) 2637 return PTR_ERR(cs); 2638 2639 /* 2640 * WaDisableCtxRestoreArbitration:bdw,chv 2641 * 2642 * We don't need to perform MI_ARB_ENABLE as often as we do (in 2643 * particular all the gen that do not need the w/a at all!), if we 2644 * took care to make sure that on every switch into this context 2645 * (both ordinary and for preemption) that arbitrartion was enabled 2646 * we would be fine. However, for gen8 there is another w/a that 2647 * requires us to not preempt inside GPGPU execution, so we keep 2648 * arbitration disabled for gen8 batches. Arbitration will be 2649 * re-enabled before we close the request 2650 * (engine->emit_fini_breadcrumb). 2651 */ 2652 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2653 2654 /* FIXME(BDW+): Address space and security selectors. 
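 *
 * Bit 8 of MI_BATCH_BUFFER_START is used here as the address-space
 * selector: it is set for normal (PPGTT) dispatch and left clear only
 * for I915_DISPATCH_SECURE batches executed from the GGTT.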
*/ 2655 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 2656 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 2657 *cs++ = lower_32_bits(offset); 2658 *cs++ = upper_32_bits(offset); 2659 2660 intel_ring_advance(rq, cs); 2661 2662 return 0; 2663 } 2664 2665 static int gen9_emit_bb_start(struct i915_request *rq, 2666 u64 offset, u32 len, 2667 const unsigned int flags) 2668 { 2669 u32 *cs; 2670 2671 cs = intel_ring_begin(rq, 6); 2672 if (IS_ERR(cs)) 2673 return PTR_ERR(cs); 2674 2675 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2676 2677 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 2678 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 2679 *cs++ = lower_32_bits(offset); 2680 *cs++ = upper_32_bits(offset); 2681 2682 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 2683 *cs++ = MI_NOOP; 2684 2685 intel_ring_advance(rq, cs); 2686 2687 return 0; 2688 } 2689 2690 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 2691 { 2692 ENGINE_WRITE(engine, RING_IMR, 2693 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 2694 ENGINE_POSTING_READ(engine, RING_IMR); 2695 } 2696 2697 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 2698 { 2699 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 2700 } 2701 2702 static int gen8_emit_flush(struct i915_request *request, u32 mode) 2703 { 2704 u32 cmd, *cs; 2705 2706 cs = intel_ring_begin(request, 4); 2707 if (IS_ERR(cs)) 2708 return PTR_ERR(cs); 2709 2710 cmd = MI_FLUSH_DW + 1; 2711 2712 /* We always require a command barrier so that subsequent 2713 * commands, such as breadcrumb interrupts, are strictly ordered 2714 * wrt the contents of the write cache being flushed to memory 2715 * (and thus being coherent from the CPU). 2716 */ 2717 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 2718 2719 if (mode & EMIT_INVALIDATE) { 2720 cmd |= MI_INVALIDATE_TLB; 2721 if (request->engine->class == VIDEO_DECODE_CLASS) 2722 cmd |= MI_INVALIDATE_BSD; 2723 } 2724 2725 *cs++ = cmd; 2726 *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; 2727 *cs++ = 0; /* upper addr */ 2728 *cs++ = 0; /* value */ 2729 intel_ring_advance(request, cs); 2730 2731 return 0; 2732 } 2733 2734 static int gen8_emit_flush_render(struct i915_request *request, 2735 u32 mode) 2736 { 2737 struct intel_engine_cs *engine = request->engine; 2738 u32 scratch_addr = 2739 intel_gt_scratch_offset(engine->gt, 2740 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); 2741 bool vf_flush_wa = false, dc_flush_wa = false; 2742 u32 *cs, flags = 0; 2743 int len; 2744 2745 flags |= PIPE_CONTROL_CS_STALL; 2746 2747 if (mode & EMIT_FLUSH) { 2748 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 2749 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 2750 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 2751 flags |= PIPE_CONTROL_FLUSH_ENABLE; 2752 } 2753 2754 if (mode & EMIT_INVALIDATE) { 2755 flags |= PIPE_CONTROL_TLB_INVALIDATE; 2756 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 2757 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 2758 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 2759 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 2760 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 2761 flags |= PIPE_CONTROL_QW_WRITE; 2762 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2763 2764 /* 2765 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 2766 * pipe control. 
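 *
 * On KBL up to revision B0 (WaForGAMHang) the main flush is additionally
 * bracketed below by a DC-flush pipe control before it and a CS-stall
 * pipe control after it.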
2767 */ 2768 if (IS_GEN(request->i915, 9)) 2769 vf_flush_wa = true; 2770 2771 /* WaForGAMHang:kbl */ 2772 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 2773 dc_flush_wa = true; 2774 } 2775 2776 len = 6; 2777 2778 if (vf_flush_wa) 2779 len += 6; 2780 2781 if (dc_flush_wa) 2782 len += 12; 2783 2784 cs = intel_ring_begin(request, len); 2785 if (IS_ERR(cs)) 2786 return PTR_ERR(cs); 2787 2788 if (vf_flush_wa) 2789 cs = gen8_emit_pipe_control(cs, 0, 0); 2790 2791 if (dc_flush_wa) 2792 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 2793 0); 2794 2795 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2796 2797 if (dc_flush_wa) 2798 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 2799 2800 intel_ring_advance(request, cs); 2801 2802 return 0; 2803 } 2804 2805 static int gen11_emit_flush_render(struct i915_request *request, 2806 u32 mode) 2807 { 2808 struct intel_engine_cs *engine = request->engine; 2809 const u32 scratch_addr = 2810 intel_gt_scratch_offset(engine->gt, 2811 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); 2812 2813 if (mode & EMIT_FLUSH) { 2814 u32 *cs; 2815 u32 flags = 0; 2816 2817 flags |= PIPE_CONTROL_CS_STALL; 2818 2819 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 2820 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 2821 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 2822 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 2823 flags |= PIPE_CONTROL_FLUSH_ENABLE; 2824 flags |= PIPE_CONTROL_QW_WRITE; 2825 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2826 2827 cs = intel_ring_begin(request, 6); 2828 if (IS_ERR(cs)) 2829 return PTR_ERR(cs); 2830 2831 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2832 intel_ring_advance(request, cs); 2833 } 2834 2835 if (mode & EMIT_INVALIDATE) { 2836 u32 *cs; 2837 u32 flags = 0; 2838 2839 flags |= PIPE_CONTROL_CS_STALL; 2840 2841 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 2842 flags |= PIPE_CONTROL_TLB_INVALIDATE; 2843 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 2844 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 2845 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 2846 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 2847 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 2848 flags |= PIPE_CONTROL_QW_WRITE; 2849 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; 2850 2851 cs = intel_ring_begin(request, 6); 2852 if (IS_ERR(cs)) 2853 return PTR_ERR(cs); 2854 2855 cs = gen8_emit_pipe_control(cs, flags, scratch_addr); 2856 intel_ring_advance(request, cs); 2857 } 2858 2859 return 0; 2860 } 2861 2862 /* 2863 * Reserve space for 2 NOOPs at the end of each request to be 2864 * used as a workaround for not being allowed to do lite 2865 * restore with HEAD==TAIL (WaIdleLiteRestore). 2866 */ 2867 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 2868 { 2869 /* Ensure there's always at least one preemption point per-request. 
*/ 2870 *cs++ = MI_ARB_CHECK; 2871 *cs++ = MI_NOOP; 2872 request->wa_tail = intel_ring_offset(request, cs); 2873 2874 return cs; 2875 } 2876 2877 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 2878 { 2879 *cs++ = MI_SEMAPHORE_WAIT | 2880 MI_SEMAPHORE_GLOBAL_GTT | 2881 MI_SEMAPHORE_POLL | 2882 MI_SEMAPHORE_SAD_EQ_SDD; 2883 *cs++ = 0; 2884 *cs++ = intel_hws_preempt_address(request->engine); 2885 *cs++ = 0; 2886 2887 return cs; 2888 } 2889 2890 static __always_inline u32* 2891 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 2892 u32 *cs) 2893 { 2894 *cs++ = MI_USER_INTERRUPT; 2895 2896 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 2897 if (intel_engine_has_semaphores(request->engine)) 2898 cs = emit_preempt_busywait(request, cs); 2899 2900 request->tail = intel_ring_offset(request, cs); 2901 assert_ring_tail_valid(request->ring, request->tail); 2902 2903 return gen8_emit_wa_tail(request, cs); 2904 } 2905 2906 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 2907 { 2908 cs = gen8_emit_ggtt_write(cs, 2909 request->fence.seqno, 2910 request->timeline->hwsp_offset, 2911 0); 2912 2913 return gen8_emit_fini_breadcrumb_footer(request, cs); 2914 } 2915 2916 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 2917 { 2918 cs = gen8_emit_ggtt_write_rcs(cs, 2919 request->fence.seqno, 2920 request->timeline->hwsp_offset, 2921 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 2922 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 2923 PIPE_CONTROL_DC_FLUSH_ENABLE); 2924 2925 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 2926 cs = gen8_emit_pipe_control(cs, 2927 PIPE_CONTROL_FLUSH_ENABLE | 2928 PIPE_CONTROL_CS_STALL, 2929 0); 2930 2931 return gen8_emit_fini_breadcrumb_footer(request, cs); 2932 } 2933 2934 static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, 2935 u32 *cs) 2936 { 2937 cs = gen8_emit_ggtt_write_rcs(cs, 2938 request->fence.seqno, 2939 request->timeline->hwsp_offset, 2940 PIPE_CONTROL_CS_STALL | 2941 PIPE_CONTROL_TILE_CACHE_FLUSH | 2942 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 2943 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 2944 PIPE_CONTROL_DC_FLUSH_ENABLE | 2945 PIPE_CONTROL_FLUSH_ENABLE); 2946 2947 return gen8_emit_fini_breadcrumb_footer(request, cs); 2948 } 2949 2950 static void execlists_park(struct intel_engine_cs *engine) 2951 { 2952 del_timer(&engine->execlists.timer); 2953 } 2954 2955 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 2956 { 2957 engine->submit_request = execlists_submit_request; 2958 engine->cancel_requests = execlists_cancel_requests; 2959 engine->schedule = i915_schedule; 2960 engine->execlists.tasklet.func = execlists_submission_tasklet; 2961 2962 engine->reset.prepare = execlists_reset_prepare; 2963 engine->reset.reset = execlists_reset; 2964 engine->reset.finish = execlists_reset_finish; 2965 2966 engine->park = execlists_park; 2967 engine->unpark = NULL; 2968 2969 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 2970 if (!intel_vgpu_active(engine->i915)) { 2971 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 2972 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 2973 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 2974 } 2975 } 2976 2977 static void execlists_destroy(struct intel_engine_cs *engine) 2978 { 2979 intel_engine_cleanup_common(engine); 2980 lrc_destroy_wa_ctx(engine); 2981 kfree(engine); 2982 } 2983 2984 static void 2985 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 2986 { 2987 /* Default vfuncs which can be overriden by each 
engine. */ 2988 2989 engine->destroy = execlists_destroy; 2990 engine->resume = execlists_resume; 2991 2992 engine->reset.prepare = execlists_reset_prepare; 2993 engine->reset.reset = execlists_reset; 2994 engine->reset.finish = execlists_reset_finish; 2995 2996 engine->cops = &execlists_context_ops; 2997 engine->request_alloc = execlists_request_alloc; 2998 2999 engine->emit_flush = gen8_emit_flush; 3000 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 3001 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 3002 3003 engine->set_default_submission = intel_execlists_set_default_submission; 3004 3005 if (INTEL_GEN(engine->i915) < 11) { 3006 engine->irq_enable = gen8_logical_ring_enable_irq; 3007 engine->irq_disable = gen8_logical_ring_disable_irq; 3008 } else { 3009 /* 3010 * TODO: On Gen11 interrupt masks need to be clear 3011 * to allow C6 entry. Keep interrupts enabled at 3012 * and take the hit of generating extra interrupts 3013 * until a more refined solution exists. 3014 */ 3015 } 3016 if (IS_GEN(engine->i915, 8)) 3017 engine->emit_bb_start = gen8_emit_bb_start; 3018 else 3019 engine->emit_bb_start = gen9_emit_bb_start; 3020 } 3021 3022 static inline void 3023 logical_ring_default_irqs(struct intel_engine_cs *engine) 3024 { 3025 unsigned int shift = 0; 3026 3027 if (INTEL_GEN(engine->i915) < 11) { 3028 const u8 irq_shifts[] = { 3029 [RCS0] = GEN8_RCS_IRQ_SHIFT, 3030 [BCS0] = GEN8_BCS_IRQ_SHIFT, 3031 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 3032 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 3033 [VECS0] = GEN8_VECS_IRQ_SHIFT, 3034 }; 3035 3036 shift = irq_shifts[engine->id]; 3037 } 3038 3039 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 3040 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 3041 } 3042 3043 static void rcs_submission_override(struct intel_engine_cs *engine) 3044 { 3045 switch (INTEL_GEN(engine->i915)) { 3046 case 12: 3047 case 11: 3048 engine->emit_flush = gen11_emit_flush_render; 3049 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 3050 break; 3051 default: 3052 engine->emit_flush = gen8_emit_flush_render; 3053 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 3054 break; 3055 } 3056 } 3057 3058 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 3059 { 3060 tasklet_init(&engine->execlists.tasklet, 3061 execlists_submission_tasklet, (unsigned long)engine); 3062 timer_setup(&engine->execlists.timer, execlists_submission_timer, 0); 3063 3064 logical_ring_default_vfuncs(engine); 3065 logical_ring_default_irqs(engine); 3066 3067 if (engine->class == RENDER_CLASS) 3068 rcs_submission_override(engine); 3069 3070 return 0; 3071 } 3072 3073 int intel_execlists_submission_init(struct intel_engine_cs *engine) 3074 { 3075 struct intel_engine_execlists * const execlists = &engine->execlists; 3076 struct drm_i915_private *i915 = engine->i915; 3077 struct intel_uncore *uncore = engine->uncore; 3078 u32 base = engine->mmio_base; 3079 int ret; 3080 3081 ret = intel_engine_init_common(engine); 3082 if (ret) 3083 return ret; 3084 3085 if (intel_init_workaround_bb(engine)) 3086 /* 3087 * We continue even if we fail to initialize WA batch 3088 * because we only expect rare glitches but nothing 3089 * critical to prevent us from using GPU 3090 */ 3091 DRM_ERROR("WA batch buffer initialization failed\n"); 3092 3093 if (HAS_LOGICAL_RING_ELSQ(i915)) { 3094 execlists->submit_reg = uncore->regs + 3095 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 3096 execlists->ctrl_reg = uncore->regs + 3097 
i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 3098 } else { 3099 execlists->submit_reg = uncore->regs + 3100 i915_mmio_reg_offset(RING_ELSP(base)); 3101 } 3102 3103 execlists->csb_status = 3104 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 3105 3106 execlists->csb_write = 3107 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 3108 3109 if (INTEL_GEN(i915) < 11) 3110 execlists->csb_size = GEN8_CSB_ENTRIES; 3111 else 3112 execlists->csb_size = GEN11_CSB_ENTRIES; 3113 3114 reset_csb_pointers(engine); 3115 3116 return 0; 3117 } 3118 3119 static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine) 3120 { 3121 u32 indirect_ctx_offset; 3122 3123 switch (INTEL_GEN(engine->i915)) { 3124 default: 3125 MISSING_CASE(INTEL_GEN(engine->i915)); 3126 /* fall through */ 3127 case 12: 3128 indirect_ctx_offset = 3129 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3130 break; 3131 case 11: 3132 indirect_ctx_offset = 3133 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3134 break; 3135 case 10: 3136 indirect_ctx_offset = 3137 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3138 break; 3139 case 9: 3140 indirect_ctx_offset = 3141 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3142 break; 3143 case 8: 3144 indirect_ctx_offset = 3145 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 3146 break; 3147 } 3148 3149 return indirect_ctx_offset; 3150 } 3151 3152 static void execlists_init_reg_state(u32 *regs, 3153 struct intel_context *ce, 3154 struct intel_engine_cs *engine, 3155 struct intel_ring *ring) 3156 { 3157 struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->vm); 3158 bool rcs = engine->class == RENDER_CLASS; 3159 u32 base = engine->mmio_base; 3160 3161 /* 3162 * A context is actually a big batch buffer with several 3163 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 3164 * values we are setting here are only for the first context restore: 3165 * on a subsequent save, the GPU will recreate this batchbuffer with new 3166 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 3167 * we are not initializing here). 3168 * 3169 * Must keep consistent with virtual_update_register_offsets(). 3170 */ 3171 regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 
14 : 11) | 3172 MI_LRI_FORCE_POSTED; 3173 3174 CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base), 3175 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | 3176 _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH)); 3177 if (INTEL_GEN(engine->i915) < 11) { 3178 regs[CTX_CONTEXT_CONTROL + 1] |= 3179 _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 3180 CTX_CTRL_RS_CTX_ENABLE); 3181 } 3182 CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0); 3183 CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0); 3184 CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0); 3185 CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base), 3186 RING_CTL_SIZE(ring->size) | RING_VALID); 3187 CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0); 3188 CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0); 3189 CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT); 3190 CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0); 3191 CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0); 3192 CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0); 3193 if (rcs) { 3194 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3195 3196 CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0); 3197 CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET, 3198 RING_INDIRECT_CTX_OFFSET(base), 0); 3199 if (wa_ctx->indirect_ctx.size) { 3200 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3201 3202 regs[CTX_RCS_INDIRECT_CTX + 1] = 3203 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 3204 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 3205 3206 regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] = 3207 intel_lr_indirect_ctx_offset(engine) << 6; 3208 } 3209 3210 CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); 3211 if (wa_ctx->per_ctx.size) { 3212 u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 3213 3214 regs[CTX_BB_PER_CTX_PTR + 1] = 3215 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 3216 } 3217 } 3218 3219 regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED; 3220 3221 CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0); 3222 /* PDP values well be assigned later if needed */ 3223 CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0); 3224 CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0); 3225 CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0); 3226 CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0); 3227 CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0); 3228 CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0); 3229 CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0); 3230 CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0); 3231 3232 if (i915_vm_is_4lvl(&ppgtt->vm)) { 3233 /* 64b PPGTT (48bit canonical) 3234 * PDP0_DESCRIPTOR contains the base address to PML4 and 3235 * other PDP Descriptors are ignored. 
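 *
 * For a 3-level (32b) PPGTT the four PDP descriptor pairs are programmed
 * individually instead, see the ASSIGN_CTX_PDP() calls below.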
3236 */ 3237 ASSIGN_CTX_PML4(ppgtt, regs); 3238 } else { 3239 ASSIGN_CTX_PDP(ppgtt, regs, 3); 3240 ASSIGN_CTX_PDP(ppgtt, regs, 2); 3241 ASSIGN_CTX_PDP(ppgtt, regs, 1); 3242 ASSIGN_CTX_PDP(ppgtt, regs, 0); 3243 } 3244 3245 if (rcs) { 3246 regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); 3247 CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0); 3248 } 3249 3250 regs[CTX_END] = MI_BATCH_BUFFER_END; 3251 if (INTEL_GEN(engine->i915) >= 10) 3252 regs[CTX_END] |= BIT(0); 3253 } 3254 3255 static int 3256 populate_lr_context(struct intel_context *ce, 3257 struct drm_i915_gem_object *ctx_obj, 3258 struct intel_engine_cs *engine, 3259 struct intel_ring *ring) 3260 { 3261 void *vaddr; 3262 u32 *regs; 3263 int ret; 3264 3265 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 3266 if (IS_ERR(vaddr)) { 3267 ret = PTR_ERR(vaddr); 3268 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 3269 return ret; 3270 } 3271 3272 set_redzone(vaddr, engine); 3273 3274 if (engine->default_state) { 3275 /* 3276 * We only want to copy over the template context state; 3277 * skipping over the headers reserved for GuC communication, 3278 * leaving those as zero. 3279 */ 3280 const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE; 3281 void *defaults; 3282 3283 defaults = i915_gem_object_pin_map(engine->default_state, 3284 I915_MAP_WB); 3285 if (IS_ERR(defaults)) { 3286 ret = PTR_ERR(defaults); 3287 goto err_unpin_ctx; 3288 } 3289 3290 memcpy(vaddr + start, defaults + start, engine->context_size); 3291 i915_gem_object_unpin_map(engine->default_state); 3292 } 3293 3294 /* The second page of the context object contains some fields which must 3295 * be set up prior to the first execution. */ 3296 regs = vaddr + LRC_STATE_PN * PAGE_SIZE; 3297 execlists_init_reg_state(regs, ce, engine, ring); 3298 if (!engine->default_state) 3299 regs[CTX_CONTEXT_CONTROL + 1] |= 3300 _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 3301 3302 ret = 0; 3303 err_unpin_ctx: 3304 __i915_gem_object_flush_map(ctx_obj, 3305 LRC_HEADER_PAGES * PAGE_SIZE, 3306 engine->context_size); 3307 i915_gem_object_unpin_map(ctx_obj); 3308 return ret; 3309 } 3310 3311 static int __execlists_context_alloc(struct intel_context *ce, 3312 struct intel_engine_cs *engine) 3313 { 3314 struct drm_i915_gem_object *ctx_obj; 3315 struct intel_ring *ring; 3316 struct i915_vma *vma; 3317 u32 context_size; 3318 int ret; 3319 3320 GEM_BUG_ON(ce->state); 3321 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 3322 3323 /* 3324 * Before the actual start of the context image, we insert a few pages 3325 * for our own use and for sharing with the GuC. 
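 *
 * In CONFIG_DRM_I915_DEBUG_GEM builds an extra trailing page is also
 * appended as a redzone (see set_redzone()/check_redzone()) so that
 * overruns of the context image can be detected.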
3326 */ 3327 context_size += LRC_HEADER_PAGES * PAGE_SIZE; 3328 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3329 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 3330 3331 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 3332 if (IS_ERR(ctx_obj)) 3333 return PTR_ERR(ctx_obj); 3334 3335 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 3336 if (IS_ERR(vma)) { 3337 ret = PTR_ERR(vma); 3338 goto error_deref_obj; 3339 } 3340 3341 if (!ce->timeline) { 3342 struct intel_timeline *tl; 3343 3344 tl = intel_timeline_create(engine->gt, NULL); 3345 if (IS_ERR(tl)) { 3346 ret = PTR_ERR(tl); 3347 goto error_deref_obj; 3348 } 3349 3350 ce->timeline = tl; 3351 } 3352 3353 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 3354 if (IS_ERR(ring)) { 3355 ret = PTR_ERR(ring); 3356 goto error_deref_obj; 3357 } 3358 3359 ret = populate_lr_context(ce, ctx_obj, engine, ring); 3360 if (ret) { 3361 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 3362 goto error_ring_free; 3363 } 3364 3365 ce->ring = ring; 3366 ce->state = vma; 3367 3368 return 0; 3369 3370 error_ring_free: 3371 intel_ring_put(ring); 3372 error_deref_obj: 3373 i915_gem_object_put(ctx_obj); 3374 return ret; 3375 } 3376 3377 static struct list_head *virtual_queue(struct virtual_engine *ve) 3378 { 3379 return &ve->base.execlists.default_priolist.requests[0]; 3380 } 3381 3382 static void virtual_context_destroy(struct kref *kref) 3383 { 3384 struct virtual_engine *ve = 3385 container_of(kref, typeof(*ve), context.ref); 3386 unsigned int n; 3387 3388 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 3389 GEM_BUG_ON(ve->request); 3390 GEM_BUG_ON(ve->context.inflight); 3391 3392 for (n = 0; n < ve->num_siblings; n++) { 3393 struct intel_engine_cs *sibling = ve->siblings[n]; 3394 struct rb_node *node = &ve->nodes[sibling->id].rb; 3395 3396 if (RB_EMPTY_NODE(node)) 3397 continue; 3398 3399 spin_lock_irq(&sibling->active.lock); 3400 3401 /* Detachment is lazily performed in the execlists tasklet */ 3402 if (!RB_EMPTY_NODE(node)) 3403 rb_erase_cached(node, &sibling->execlists.virtual); 3404 3405 spin_unlock_irq(&sibling->active.lock); 3406 } 3407 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 3408 3409 if (ve->context.state) 3410 __execlists_context_fini(&ve->context); 3411 intel_context_fini(&ve->context); 3412 3413 kfree(ve->bonds); 3414 kfree(ve); 3415 } 3416 3417 static void virtual_engine_initial_hint(struct virtual_engine *ve) 3418 { 3419 int swp; 3420 3421 /* 3422 * Pick a random sibling on starting to help spread the load around. 3423 * 3424 * New contexts are typically created with exactly the same order 3425 * of siblings, and often started in batches. Due to the way we iterate 3426 * the array of sibling when submitting requests, sibling[0] is 3427 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 3428 * randomised across the system, we also help spread the load by the 3429 * first engine we inspect being different each time. 3430 * 3431 * NB This does not force us to execute on this engine, it will just 3432 * typically be the first we inspect for submission. 
3433 */ 3434 swp = prandom_u32_max(ve->num_siblings); 3435 if (!swp) 3436 return; 3437 3438 swap(ve->siblings[swp], ve->siblings[0]); 3439 virtual_update_register_offsets(ve->context.lrc_reg_state, 3440 ve->siblings[0]); 3441 } 3442 3443 static int virtual_context_pin(struct intel_context *ce) 3444 { 3445 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3446 int err; 3447 3448 /* Note: we must use a real engine class for setting up reg state */ 3449 err = __execlists_context_pin(ce, ve->siblings[0]); 3450 if (err) 3451 return err; 3452 3453 virtual_engine_initial_hint(ve); 3454 return 0; 3455 } 3456 3457 static void virtual_context_enter(struct intel_context *ce) 3458 { 3459 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3460 unsigned int n; 3461 3462 for (n = 0; n < ve->num_siblings; n++) 3463 intel_engine_pm_get(ve->siblings[n]); 3464 3465 intel_timeline_enter(ce->timeline); 3466 } 3467 3468 static void virtual_context_exit(struct intel_context *ce) 3469 { 3470 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 3471 unsigned int n; 3472 3473 intel_timeline_exit(ce->timeline); 3474 3475 for (n = 0; n < ve->num_siblings; n++) 3476 intel_engine_pm_put(ve->siblings[n]); 3477 } 3478 3479 static const struct intel_context_ops virtual_context_ops = { 3480 .pin = virtual_context_pin, 3481 .unpin = execlists_context_unpin, 3482 3483 .enter = virtual_context_enter, 3484 .exit = virtual_context_exit, 3485 3486 .destroy = virtual_context_destroy, 3487 }; 3488 3489 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 3490 { 3491 struct i915_request *rq; 3492 intel_engine_mask_t mask; 3493 3494 rq = READ_ONCE(ve->request); 3495 if (!rq) 3496 return 0; 3497 3498 /* The rq is ready for submission; rq->execution_mask is now stable. */ 3499 mask = rq->execution_mask; 3500 if (unlikely(!mask)) { 3501 /* Invalid selection, submit to a random engine in error */ 3502 i915_request_skip(rq, -ENODEV); 3503 mask = ve->siblings[0]->mask; 3504 } 3505 3506 GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n", 3507 ve->base.name, 3508 rq->fence.context, rq->fence.seqno, 3509 mask, ve->base.execlists.queue_priority_hint); 3510 3511 return mask; 3512 } 3513 3514 static void virtual_submission_tasklet(unsigned long data) 3515 { 3516 struct virtual_engine * const ve = (struct virtual_engine *)data; 3517 const int prio = ve->base.execlists.queue_priority_hint; 3518 intel_engine_mask_t mask; 3519 unsigned int n; 3520 3521 rcu_read_lock(); 3522 mask = virtual_submission_mask(ve); 3523 rcu_read_unlock(); 3524 if (unlikely(!mask)) 3525 return; 3526 3527 local_irq_disable(); 3528 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 3529 struct intel_engine_cs *sibling = ve->siblings[n]; 3530 struct ve_node * const node = &ve->nodes[sibling->id]; 3531 struct rb_node **parent, *rb; 3532 bool first; 3533 3534 if (unlikely(!(mask & sibling->mask))) { 3535 if (!RB_EMPTY_NODE(&node->rb)) { 3536 spin_lock(&sibling->active.lock); 3537 rb_erase_cached(&node->rb, 3538 &sibling->execlists.virtual); 3539 RB_CLEAR_NODE(&node->rb); 3540 spin_unlock(&sibling->active.lock); 3541 } 3542 continue; 3543 } 3544 3545 spin_lock(&sibling->active.lock); 3546 3547 if (!RB_EMPTY_NODE(&node->rb)) { 3548 /* 3549 * Cheat and avoid rebalancing the tree if we can 3550 * reuse this node in situ. 
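 *
 * The node may stay in place if its priority is unchanged, or if its
 * priority is being raised while it is already first in the tree, as
 * the rbtree ordering is preserved in either case.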
static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = ve->base.execlists.queue_priority_hint;
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint) {
			sibling->execlists.queue_priority_hint = prio;
			tasklet_hi_schedule(&sibling->execlists.tasklet);
		}

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}

static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);

	GEM_TRACE("%s: rq=%llx:%lld\n",
		  ve->base.name,
		  rq->fence.context,
		  rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	GEM_BUG_ON(ve->request);
	GEM_BUG_ON(!list_empty(virtual_queue(ve)));

	ve->base.execlists.queue_priority_hint = rq_prio(rq);
	WRITE_ONCE(ve->request, rq);

	list_move_tail(&rq->sched.link, virtual_queue(ve));

	tasklet_schedule(&ve->base.execlists.tasklet);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct ve_bond *bond;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond) {
		intel_engine_mask_t old, new, cmp;

		cmp = READ_ONCE(rq->execution_mask);
		do {
			old = cmp;
			new = cmp & bond->sibling_mask;
		} while ((cmp = cmpxchg(&rq->execution_mask, old, new)) != old);
	}
}
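
/*
 * Create a virtual engine that load balances its requests across the given
 * physical siblings. A single sibling needs no balancing, so in that case a
 * plain context on that engine is returned instead.
 */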
struct intel_context *
intel_execlists_create_virtual(struct i915_gem_context *ctx,
			       struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(ctx, siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = ctx->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.id = -1;
	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);

	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, ctx, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	err = __execlists_context_alloc(&ve->context, siblings[0]);
	if (err)
		goto err_put;

	__set_bit(CONTEXT_ALLOC_BIT, &ve->context.flags);

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
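
/*
 * Duplicate an existing virtual engine for another context: build a new
 * virtual engine over the same set of siblings and copy across any bonds.
 */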
struct intel_context *
intel_execlists_clone_virtual(struct i915_gem_context *ctx,
			      struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(ctx,
					     se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

/*
 * Register a bond: a request submitted to this virtual engine that is
 * bonded to a request executing on @master will have its placement
 * restricted to the siblings accumulated for that master (applied in
 * virtual_bond_execute).
 */
int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}
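
/*
 * Print the requests known to the execlists backend for debugging: the
 * executing requests on engine->active, the queued priolists and any
 * requests waiting on attached virtual engines, showing at most @max
 * entries per category via the supplied @show_request callback.
 */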
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	last = NULL;
	count = 0;
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   execlists->queue_priority_hint);
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub) {
		u32 *regs = ce->lrc_reg_state;

		if (engine->pinned_default_state) {
			memcpy(regs, /* skip restoring the vanilla PPHWSP */
			       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
			       engine->context_size - PAGE_SIZE);
		}
		execlists_init_reg_state(regs, ce, engine, ce->ring);
	}

	/* Rerun the request; its payload has been neutered (if guilty). */
	ce->ring->head = head;
	intel_ring_update_space(ce->ring);

	__execlists_update_reg_state(ce, engine);
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif