/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but is instead kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
 * This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the context
 * switch flow by not doing preemption, but just sampling the new tail
 * pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */
#define WA_TAIL_DWORDS 2
#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of the sibling_mask physical engines.
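	 *
	 * Purely as an illustration of how such a bond constrains engine
	 * selection (a sketch only; virtual_find_bond() is a hypothetical
	 * lookup helper over bonds[], not something defined in this section):
	 *
	 *	const struct ve_bond *bond = virtual_find_bond(ve, master);
	 *	intel_engine_mask_t allowed =
	 *		bond ? bond->sibling_mask : ALL_ENGINES;
	 *
	 * i.e. the bond only narrows the set of eligible siblings; the final
	 * engine is still chosen by the usual late, greedy scheduling.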
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[0]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static void mark_eio(struct i915_request *rq) 244 { 245 if (i915_request_completed(rq)) 246 return; 247 248 GEM_BUG_ON(i915_request_signaled(rq)); 249 250 dma_fence_set_error(&rq->fence, -EIO); 251 i915_request_mark_complete(rq); 252 } 253 254 static struct i915_request * 255 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 256 { 257 struct i915_request *active = rq; 258 259 rcu_read_lock(); 260 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 261 if (i915_request_completed(rq)) 262 break; 263 264 active = rq; 265 } 266 rcu_read_unlock(); 267 268 return active; 269 } 270 271 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 272 { 273 return (i915_ggtt_offset(engine->status_page.vma) + 274 I915_GEM_HWS_PREEMPT_ADDR); 275 } 276 277 static inline void 278 ring_set_paused(const struct intel_engine_cs *engine, int state) 279 { 280 /* 281 * We inspect HWS_PREEMPT with a semaphore inside 282 * engine->emit_fini_breadcrumb. If the dword is true, 283 * the ring is paused as the semaphore will busywait 284 * until the dword is false. 285 */ 286 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 287 if (state) 288 wmb(); 289 } 290 291 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 292 { 293 return rb_entry(rb, struct i915_priolist, node); 294 } 295 296 static inline int rq_prio(const struct i915_request *rq) 297 { 298 return rq->sched.attr.priority; 299 } 300 301 static int effective_prio(const struct i915_request *rq) 302 { 303 int prio = rq_prio(rq); 304 305 /* 306 * If this request is special and must not be interrupted at any 307 * cost, so be it. Note we are only checking the most recent request 308 * in the context and so may be masking an earlier vip request. It 309 * is hoped that under the conditions where nopreempt is used, this 310 * will not matter (i.e. all requests to that context will be 311 * nopreempt for as long as desired). 312 */ 313 if (i915_request_has_nopreempt(rq)) 314 prio = I915_PRIORITY_UNPREEMPTABLE; 315 316 /* 317 * On unwinding the active request, we give it a priority bump 318 * if it has completed waiting on any semaphore. If we know that 319 * the request has already started, we can prevent an unwanted 320 * preempt-to-idle cycle by taking that into account now. 
	 */
	if (__i915_request_has_started(rq))
		prio |= I915_PRIORITY_NOSEMAPHORE;

	/* Restrict mere WAIT boosts from triggering preemption */
	BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
	return prio | __NO_PREEMPTION;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1]: it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
 *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
 *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
 *      bits 53-54:    mbz, reserved for use by hardware
 *      bits 55-63:    group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *      bits 32-36:    reserved
 *      bits 37-47:    SW context ID
 *      bits 48-53:    engine instance
 *      bit  54:       mbz, reserved for use by hardware
 *      bits 55-60:    SW counter
 *      bits 61-63:    engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
 */
static u64
lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
{
	u64 desc;

	desc = INTEL_LEGACY_32B_CONTEXT;
	if (i915_vm_is_4lvl(ce->vm))
		desc = INTEL_LEGACY_64B_CONTEXT;
	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;

	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
	if (IS_GEN(engine->i915, 8))
		desc |= GEN8_CTX_L3LLC_COHERENT;

	desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
	/*
	 * The following 32bits are copied into the OA reports (dword 2).
	 * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
	 * anything below.
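	 *
	 * As an illustration of the Gen11+ layout documented above (a sketch
	 * only; the bit ranges are taken from that comment, and the SW
	 * context ID portion is filled in later, at schedule-in, not here):
	 *
	 *	u32 sw_ctx_id = FIELD_GET(GENMASK_ULL(47, 37), desc);
	 *	u32 instance  = FIELD_GET(GENMASK_ULL(53, 48), desc);
	 *	u32 class     = FIELD_GET(GENMASK_ULL(63, 61), desc);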
480 */ 481 if (INTEL_GEN(engine->i915) >= 11) { 482 desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; 483 /* bits 48-53 */ 484 485 desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; 486 /* bits 61-63 */ 487 } 488 489 return desc; 490 } 491 492 static inline unsigned int dword_in_page(void *addr) 493 { 494 return offset_in_page(addr) / sizeof(u32); 495 } 496 497 static void set_offsets(u32 *regs, 498 const u8 *data, 499 const struct intel_engine_cs *engine, 500 bool clear) 501 #define NOP(x) (BIT(7) | (x)) 502 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 503 #define POSTED BIT(0) 504 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 505 #define REG16(x) \ 506 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 507 (((x) >> 2) & 0x7f) 508 #define END(x) 0, (x) 509 { 510 const u32 base = engine->mmio_base; 511 512 while (*data) { 513 u8 count, flags; 514 515 if (*data & BIT(7)) { /* skip */ 516 count = *data++ & ~BIT(7); 517 if (clear) 518 memset32(regs, MI_NOOP, count); 519 regs += count; 520 continue; 521 } 522 523 count = *data & 0x3f; 524 flags = *data >> 6; 525 data++; 526 527 *regs = MI_LOAD_REGISTER_IMM(count); 528 if (flags & POSTED) 529 *regs |= MI_LRI_FORCE_POSTED; 530 if (INTEL_GEN(engine->i915) >= 11) 531 *regs |= MI_LRI_CS_MMIO; 532 regs++; 533 534 GEM_BUG_ON(!count); 535 do { 536 u32 offset = 0; 537 u8 v; 538 539 do { 540 v = *data++; 541 offset <<= 7; 542 offset |= v & ~BIT(7); 543 } while (v & BIT(7)); 544 545 regs[0] = base + (offset << 2); 546 if (clear) 547 regs[1] = 0; 548 regs += 2; 549 } while (--count); 550 } 551 552 if (clear) { 553 u8 count = *++data; 554 555 /* Clear past the tail for HW access */ 556 GEM_BUG_ON(dword_in_page(regs) > count); 557 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 558 559 /* Close the batch; used mainly by live_lrc_layout() */ 560 *regs = MI_BATCH_BUFFER_END; 561 if (INTEL_GEN(engine->i915) >= 10) 562 *regs |= BIT(0); 563 } 564 } 565 566 static const u8 gen8_xcs_offsets[] = { 567 NOP(1), 568 LRI(11, 0), 569 REG16(0x244), 570 REG(0x034), 571 REG(0x030), 572 REG(0x038), 573 REG(0x03c), 574 REG(0x168), 575 REG(0x140), 576 REG(0x110), 577 REG(0x11c), 578 REG(0x114), 579 REG(0x118), 580 581 NOP(9), 582 LRI(9, 0), 583 REG16(0x3a8), 584 REG16(0x28c), 585 REG16(0x288), 586 REG16(0x284), 587 REG16(0x280), 588 REG16(0x27c), 589 REG16(0x278), 590 REG16(0x274), 591 REG16(0x270), 592 593 NOP(13), 594 LRI(2, 0), 595 REG16(0x200), 596 REG(0x028), 597 598 END(80) 599 }; 600 601 static const u8 gen9_xcs_offsets[] = { 602 NOP(1), 603 LRI(14, POSTED), 604 REG16(0x244), 605 REG(0x034), 606 REG(0x030), 607 REG(0x038), 608 REG(0x03c), 609 REG(0x168), 610 REG(0x140), 611 REG(0x110), 612 REG(0x11c), 613 REG(0x114), 614 REG(0x118), 615 REG(0x1c0), 616 REG(0x1c4), 617 REG(0x1c8), 618 619 NOP(3), 620 LRI(9, POSTED), 621 REG16(0x3a8), 622 REG16(0x28c), 623 REG16(0x288), 624 REG16(0x284), 625 REG16(0x280), 626 REG16(0x27c), 627 REG16(0x278), 628 REG16(0x274), 629 REG16(0x270), 630 631 NOP(13), 632 LRI(1, POSTED), 633 REG16(0x200), 634 635 NOP(13), 636 LRI(44, POSTED), 637 REG(0x028), 638 REG(0x09c), 639 REG(0x0c0), 640 REG(0x178), 641 REG(0x17c), 642 REG16(0x358), 643 REG(0x170), 644 REG(0x150), 645 REG(0x154), 646 REG(0x158), 647 REG16(0x41c), 648 REG16(0x600), 649 REG16(0x604), 650 REG16(0x608), 651 REG16(0x60c), 652 REG16(0x610), 653 REG16(0x614), 654 REG16(0x618), 655 REG16(0x61c), 656 REG16(0x620), 657 REG16(0x624), 658 REG16(0x628), 659 REG16(0x62c), 660 REG16(0x630), 661 
REG16(0x634), 662 REG16(0x638), 663 REG16(0x63c), 664 REG16(0x640), 665 REG16(0x644), 666 REG16(0x648), 667 REG16(0x64c), 668 REG16(0x650), 669 REG16(0x654), 670 REG16(0x658), 671 REG16(0x65c), 672 REG16(0x660), 673 REG16(0x664), 674 REG16(0x668), 675 REG16(0x66c), 676 REG16(0x670), 677 REG16(0x674), 678 REG16(0x678), 679 REG16(0x67c), 680 REG(0x068), 681 682 END(176) 683 }; 684 685 static const u8 gen12_xcs_offsets[] = { 686 NOP(1), 687 LRI(13, POSTED), 688 REG16(0x244), 689 REG(0x034), 690 REG(0x030), 691 REG(0x038), 692 REG(0x03c), 693 REG(0x168), 694 REG(0x140), 695 REG(0x110), 696 REG(0x1c0), 697 REG(0x1c4), 698 REG(0x1c8), 699 REG(0x180), 700 REG16(0x2b4), 701 702 NOP(5), 703 LRI(9, POSTED), 704 REG16(0x3a8), 705 REG16(0x28c), 706 REG16(0x288), 707 REG16(0x284), 708 REG16(0x280), 709 REG16(0x27c), 710 REG16(0x278), 711 REG16(0x274), 712 REG16(0x270), 713 714 END(80) 715 }; 716 717 static const u8 gen8_rcs_offsets[] = { 718 NOP(1), 719 LRI(14, POSTED), 720 REG16(0x244), 721 REG(0x034), 722 REG(0x030), 723 REG(0x038), 724 REG(0x03c), 725 REG(0x168), 726 REG(0x140), 727 REG(0x110), 728 REG(0x11c), 729 REG(0x114), 730 REG(0x118), 731 REG(0x1c0), 732 REG(0x1c4), 733 REG(0x1c8), 734 735 NOP(3), 736 LRI(9, POSTED), 737 REG16(0x3a8), 738 REG16(0x28c), 739 REG16(0x288), 740 REG16(0x284), 741 REG16(0x280), 742 REG16(0x27c), 743 REG16(0x278), 744 REG16(0x274), 745 REG16(0x270), 746 747 NOP(13), 748 LRI(1, 0), 749 REG(0x0c8), 750 751 END(80) 752 }; 753 754 static const u8 gen9_rcs_offsets[] = { 755 NOP(1), 756 LRI(14, POSTED), 757 REG16(0x244), 758 REG(0x34), 759 REG(0x30), 760 REG(0x38), 761 REG(0x3c), 762 REG(0x168), 763 REG(0x140), 764 REG(0x110), 765 REG(0x11c), 766 REG(0x114), 767 REG(0x118), 768 REG(0x1c0), 769 REG(0x1c4), 770 REG(0x1c8), 771 772 NOP(3), 773 LRI(9, POSTED), 774 REG16(0x3a8), 775 REG16(0x28c), 776 REG16(0x288), 777 REG16(0x284), 778 REG16(0x280), 779 REG16(0x27c), 780 REG16(0x278), 781 REG16(0x274), 782 REG16(0x270), 783 784 NOP(13), 785 LRI(1, 0), 786 REG(0xc8), 787 788 NOP(13), 789 LRI(44, POSTED), 790 REG(0x28), 791 REG(0x9c), 792 REG(0xc0), 793 REG(0x178), 794 REG(0x17c), 795 REG16(0x358), 796 REG(0x170), 797 REG(0x150), 798 REG(0x154), 799 REG(0x158), 800 REG16(0x41c), 801 REG16(0x600), 802 REG16(0x604), 803 REG16(0x608), 804 REG16(0x60c), 805 REG16(0x610), 806 REG16(0x614), 807 REG16(0x618), 808 REG16(0x61c), 809 REG16(0x620), 810 REG16(0x624), 811 REG16(0x628), 812 REG16(0x62c), 813 REG16(0x630), 814 REG16(0x634), 815 REG16(0x638), 816 REG16(0x63c), 817 REG16(0x640), 818 REG16(0x644), 819 REG16(0x648), 820 REG16(0x64c), 821 REG16(0x650), 822 REG16(0x654), 823 REG16(0x658), 824 REG16(0x65c), 825 REG16(0x660), 826 REG16(0x664), 827 REG16(0x668), 828 REG16(0x66c), 829 REG16(0x670), 830 REG16(0x674), 831 REG16(0x678), 832 REG16(0x67c), 833 REG(0x68), 834 835 END(176) 836 }; 837 838 static const u8 gen11_rcs_offsets[] = { 839 NOP(1), 840 LRI(15, POSTED), 841 REG16(0x244), 842 REG(0x034), 843 REG(0x030), 844 REG(0x038), 845 REG(0x03c), 846 REG(0x168), 847 REG(0x140), 848 REG(0x110), 849 REG(0x11c), 850 REG(0x114), 851 REG(0x118), 852 REG(0x1c0), 853 REG(0x1c4), 854 REG(0x1c8), 855 REG(0x180), 856 857 NOP(1), 858 LRI(9, POSTED), 859 REG16(0x3a8), 860 REG16(0x28c), 861 REG16(0x288), 862 REG16(0x284), 863 REG16(0x280), 864 REG16(0x27c), 865 REG16(0x278), 866 REG16(0x274), 867 REG16(0x270), 868 869 LRI(1, POSTED), 870 REG(0x1b0), 871 872 NOP(10), 873 LRI(1, 0), 874 REG(0x0c8), 875 876 END(80) 877 }; 878 879 static const u8 gen12_rcs_offsets[] = { 880 NOP(1), 881 LRI(13, 
POSTED), 882 REG16(0x244), 883 REG(0x034), 884 REG(0x030), 885 REG(0x038), 886 REG(0x03c), 887 REG(0x168), 888 REG(0x140), 889 REG(0x110), 890 REG(0x1c0), 891 REG(0x1c4), 892 REG(0x1c8), 893 REG(0x180), 894 REG16(0x2b4), 895 896 NOP(5), 897 LRI(9, POSTED), 898 REG16(0x3a8), 899 REG16(0x28c), 900 REG16(0x288), 901 REG16(0x284), 902 REG16(0x280), 903 REG16(0x27c), 904 REG16(0x278), 905 REG16(0x274), 906 REG16(0x270), 907 908 LRI(3, POSTED), 909 REG(0x1b0), 910 REG16(0x5a8), 911 REG16(0x5ac), 912 913 NOP(6), 914 LRI(1, 0), 915 REG(0x0c8), 916 917 END(80) 918 }; 919 920 #undef END 921 #undef REG16 922 #undef REG 923 #undef LRI 924 #undef NOP 925 926 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 927 { 928 /* 929 * The gen12+ lists only have the registers we program in the basic 930 * default state. We rely on the context image using relative 931 * addressing to automatic fixup the register state between the 932 * physical engines for virtual engine. 933 */ 934 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 935 !intel_engine_has_relative_mmio(engine)); 936 937 if (engine->class == RENDER_CLASS) { 938 if (INTEL_GEN(engine->i915) >= 12) 939 return gen12_rcs_offsets; 940 else if (INTEL_GEN(engine->i915) >= 11) 941 return gen11_rcs_offsets; 942 else if (INTEL_GEN(engine->i915) >= 9) 943 return gen9_rcs_offsets; 944 else 945 return gen8_rcs_offsets; 946 } else { 947 if (INTEL_GEN(engine->i915) >= 12) 948 return gen12_xcs_offsets; 949 else if (INTEL_GEN(engine->i915) >= 9) 950 return gen9_xcs_offsets; 951 else 952 return gen8_xcs_offsets; 953 } 954 } 955 956 static struct i915_request * 957 __unwind_incomplete_requests(struct intel_engine_cs *engine) 958 { 959 struct i915_request *rq, *rn, *active = NULL; 960 struct list_head *uninitialized_var(pl); 961 int prio = I915_PRIORITY_INVALID; 962 963 lockdep_assert_held(&engine->active.lock); 964 965 list_for_each_entry_safe_reverse(rq, rn, 966 &engine->active.requests, 967 sched.link) { 968 if (i915_request_completed(rq)) 969 continue; /* XXX */ 970 971 __i915_request_unsubmit(rq); 972 973 /* 974 * Push the request back into the queue for later resubmission. 975 * If this request is not native to this physical engine (i.e. 976 * it came from a virtual source), push it back onto the virtual 977 * engine so that it can be moved across onto another physical 978 * engine as load dictates. 979 */ 980 if (likely(rq->execution_mask == engine->mask)) { 981 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 982 if (rq_prio(rq) != prio) { 983 prio = rq_prio(rq); 984 pl = i915_sched_lookup_priolist(engine, prio); 985 } 986 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 987 988 list_move(&rq->sched.link, pl); 989 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 990 991 active = rq; 992 } else { 993 struct intel_engine_cs *owner = rq->context->engine; 994 995 /* 996 * Decouple the virtual breadcrumb before moving it 997 * back to the virtual engine -- we don't want the 998 * request to complete in the background and try 999 * and cancel the breadcrumb on the virtual engine 1000 * (instead of the old engine where it is linked)! 
1001 */ 1002 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 1003 &rq->fence.flags)) { 1004 spin_lock_nested(&rq->lock, 1005 SINGLE_DEPTH_NESTING); 1006 i915_request_cancel_breadcrumb(rq); 1007 spin_unlock(&rq->lock); 1008 } 1009 rq->engine = owner; 1010 owner->submit_request(rq); 1011 active = NULL; 1012 } 1013 } 1014 1015 return active; 1016 } 1017 1018 struct i915_request * 1019 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1020 { 1021 struct intel_engine_cs *engine = 1022 container_of(execlists, typeof(*engine), execlists); 1023 1024 return __unwind_incomplete_requests(engine); 1025 } 1026 1027 static inline void 1028 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1029 { 1030 /* 1031 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1032 * The compiler should eliminate this function as dead-code. 1033 */ 1034 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1035 return; 1036 1037 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1038 status, rq); 1039 } 1040 1041 static void intel_engine_context_in(struct intel_engine_cs *engine) 1042 { 1043 unsigned long flags; 1044 1045 if (READ_ONCE(engine->stats.enabled) == 0) 1046 return; 1047 1048 write_seqlock_irqsave(&engine->stats.lock, flags); 1049 1050 if (engine->stats.enabled > 0) { 1051 if (engine->stats.active++ == 0) 1052 engine->stats.start = ktime_get(); 1053 GEM_BUG_ON(engine->stats.active == 0); 1054 } 1055 1056 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1057 } 1058 1059 static void intel_engine_context_out(struct intel_engine_cs *engine) 1060 { 1061 unsigned long flags; 1062 1063 if (READ_ONCE(engine->stats.enabled) == 0) 1064 return; 1065 1066 write_seqlock_irqsave(&engine->stats.lock, flags); 1067 1068 if (engine->stats.enabled > 0) { 1069 ktime_t last; 1070 1071 if (engine->stats.active && --engine->stats.active == 0) { 1072 /* 1073 * Decrement the active context count and in case GPU 1074 * is now idle add up to the running total. 1075 */ 1076 last = ktime_sub(ktime_get(), engine->stats.start); 1077 1078 engine->stats.total = ktime_add(engine->stats.total, 1079 last); 1080 } else if (engine->stats.active == 0) { 1081 /* 1082 * After turning on engine stats, context out might be 1083 * the first event in which case we account from the 1084 * time stats gathering was turned on. 
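			 *
			 * For example (illustrative numbers only): if stats
			 * were enabled at t=100ms while a context was already
			 * running, its context-out at t=130ms is credited as
			 *
			 *	last = 130ms - enabled_at (100ms) = 30ms
			 *
			 * rather than being lost because stats.start was never
			 * set for that context.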
1085 */ 1086 last = ktime_sub(ktime_get(), engine->stats.enabled_at); 1087 1088 engine->stats.total = ktime_add(engine->stats.total, 1089 last); 1090 } 1091 } 1092 1093 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1094 } 1095 1096 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 1097 { 1098 if (INTEL_GEN(engine->i915) >= 12) 1099 return 0x60; 1100 else if (INTEL_GEN(engine->i915) >= 9) 1101 return 0x54; 1102 else if (engine->class == RENDER_CLASS) 1103 return 0x58; 1104 else 1105 return -1; 1106 } 1107 1108 static void 1109 execlists_check_context(const struct intel_context *ce, 1110 const struct intel_engine_cs *engine) 1111 { 1112 const struct intel_ring *ring = ce->ring; 1113 u32 *regs = ce->lrc_reg_state; 1114 bool valid = true; 1115 int x; 1116 1117 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1118 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1119 engine->name, 1120 regs[CTX_RING_START], 1121 i915_ggtt_offset(ring->vma)); 1122 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1123 valid = false; 1124 } 1125 1126 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1127 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1128 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1129 engine->name, 1130 regs[CTX_RING_CTL], 1131 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1132 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1133 valid = false; 1134 } 1135 1136 x = lrc_ring_mi_mode(engine); 1137 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1138 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1139 engine->name, regs[x + 1]); 1140 regs[x + 1] &= ~STOP_RING; 1141 regs[x + 1] |= STOP_RING << 16; 1142 valid = false; 1143 } 1144 1145 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1146 } 1147 1148 static void restore_default_state(struct intel_context *ce, 1149 struct intel_engine_cs *engine) 1150 { 1151 u32 *regs = ce->lrc_reg_state; 1152 1153 if (engine->pinned_default_state) 1154 memcpy(regs, /* skip restoring the vanilla PPHWSP */ 1155 engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, 1156 engine->context_size - PAGE_SIZE); 1157 1158 execlists_init_reg_state(regs, ce, engine, ce->ring, false); 1159 } 1160 1161 static void reset_active(struct i915_request *rq, 1162 struct intel_engine_cs *engine) 1163 { 1164 struct intel_context * const ce = rq->context; 1165 u32 head; 1166 1167 /* 1168 * The executing context has been cancelled. We want to prevent 1169 * further execution along this context and propagate the error on 1170 * to anything depending on its results. 1171 * 1172 * In __i915_request_submit(), we apply the -EIO and remove the 1173 * requests' payloads for any banned requests. But first, we must 1174 * rewind the context back to the start of the incomplete request so 1175 * that we do not jump back into the middle of the batch. 1176 * 1177 * We preserve the breadcrumbs and semaphores of the incomplete 1178 * requests so that inter-timeline dependencies (i.e other timelines) 1179 * remain correctly ordered. And we defer to __i915_request_submit() 1180 * so that all asynchronous waits are correctly handled. 
1181 */ 1182 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1183 rq->fence.context, rq->fence.seqno); 1184 1185 /* On resubmission of the active request, payload will be scrubbed */ 1186 if (i915_request_completed(rq)) 1187 head = rq->tail; 1188 else 1189 head = active_request(ce->timeline, rq)->head; 1190 head = intel_ring_wrap(ce->ring, head); 1191 1192 /* Scrub the context image to prevent replaying the previous batch */ 1193 restore_default_state(ce, engine); 1194 __execlists_update_reg_state(ce, engine, head); 1195 1196 /* We've switched away, so this should be a no-op, but intent matters */ 1197 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 1198 } 1199 1200 static inline struct intel_engine_cs * 1201 __execlists_schedule_in(struct i915_request *rq) 1202 { 1203 struct intel_engine_cs * const engine = rq->engine; 1204 struct intel_context * const ce = rq->context; 1205 1206 intel_context_get(ce); 1207 1208 if (unlikely(intel_context_is_banned(ce))) 1209 reset_active(rq, engine); 1210 1211 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1212 execlists_check_context(ce, engine); 1213 1214 if (ce->tag) { 1215 /* Use a fixed tag for OA and friends */ 1216 ce->lrc_desc |= (u64)ce->tag << 32; 1217 } else { 1218 /* We don't need a strict matching tag, just different values */ 1219 ce->lrc_desc &= ~GENMASK_ULL(47, 37); 1220 ce->lrc_desc |= 1221 (u64)(++engine->context_tag % NUM_CONTEXT_TAG) << 1222 GEN11_SW_CTX_ID_SHIFT; 1223 BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID); 1224 } 1225 1226 __intel_gt_pm_get(engine->gt); 1227 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1228 intel_engine_context_in(engine); 1229 1230 return engine; 1231 } 1232 1233 static inline struct i915_request * 1234 execlists_schedule_in(struct i915_request *rq, int idx) 1235 { 1236 struct intel_context * const ce = rq->context; 1237 struct intel_engine_cs *old; 1238 1239 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1240 trace_i915_request_in(rq, idx); 1241 1242 old = READ_ONCE(ce->inflight); 1243 do { 1244 if (!old) { 1245 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1246 break; 1247 } 1248 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1249 1250 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1251 return i915_request_get(rq); 1252 } 1253 1254 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1255 { 1256 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1257 struct i915_request *next = READ_ONCE(ve->request); 1258 1259 if (next && next->execution_mask & ~rq->execution_mask) 1260 tasklet_schedule(&ve->base.execlists.tasklet); 1261 } 1262 1263 static inline void 1264 __execlists_schedule_out(struct i915_request *rq, 1265 struct intel_engine_cs * const engine) 1266 { 1267 struct intel_context * const ce = rq->context; 1268 1269 /* 1270 * NB process_csb() is not under the engine->active.lock and hence 1271 * schedule_out can race with schedule_in meaning that we should 1272 * refrain from doing non-trivial work here. 1273 */ 1274 1275 /* 1276 * If we have just completed this context, the engine may now be 1277 * idle and we want to re-enter powersaving. 
1278 */ 1279 if (list_is_last(&rq->link, &ce->timeline->requests) && 1280 i915_request_completed(rq)) 1281 intel_engine_add_retire(engine, ce->timeline); 1282 1283 intel_engine_context_out(engine); 1284 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1285 intel_gt_pm_put_async(engine->gt); 1286 1287 /* 1288 * If this is part of a virtual engine, its next request may 1289 * have been blocked waiting for access to the active context. 1290 * We have to kick all the siblings again in case we need to 1291 * switch (e.g. the next request is not runnable on this 1292 * engine). Hopefully, we will already have submitted the next 1293 * request before the tasklet runs and do not need to rebuild 1294 * each virtual tree and kick everyone again. 1295 */ 1296 if (ce->engine != engine) 1297 kick_siblings(rq, ce); 1298 1299 intel_context_put(ce); 1300 } 1301 1302 static inline void 1303 execlists_schedule_out(struct i915_request *rq) 1304 { 1305 struct intel_context * const ce = rq->context; 1306 struct intel_engine_cs *cur, *old; 1307 1308 trace_i915_request_out(rq); 1309 1310 old = READ_ONCE(ce->inflight); 1311 do 1312 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1313 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1314 if (!cur) 1315 __execlists_schedule_out(rq, old); 1316 1317 i915_request_put(rq); 1318 } 1319 1320 static u64 execlists_update_context(struct i915_request *rq) 1321 { 1322 struct intel_context *ce = rq->context; 1323 u64 desc = ce->lrc_desc; 1324 u32 tail, prev; 1325 1326 /* 1327 * WaIdleLiteRestore:bdw,skl 1328 * 1329 * We should never submit the context with the same RING_TAIL twice 1330 * just in case we submit an empty ring, which confuses the HW. 1331 * 1332 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1333 * the normal request to be able to always advance the RING_TAIL on 1334 * subsequent resubmissions (for lite restore). Should that fail us, 1335 * and we try and submit the same tail again, force the context 1336 * reload. 1337 * 1338 * If we need to return to a preempted context, we need to skip the 1339 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1340 * HW has a tendency to ignore us rewinding the TAIL to the end of 1341 * an earlier request. 1342 */ 1343 tail = intel_ring_set_tail(rq->ring, rq->tail); 1344 prev = ce->lrc_reg_state[CTX_RING_TAIL]; 1345 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1346 desc |= CTX_DESC_FORCE_RESTORE; 1347 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1348 rq->tail = rq->wa_tail; 1349 1350 /* 1351 * Make sure the context image is complete before we submit it to HW. 1352 * 1353 * Ostensibly, writes (including the WCB) should be flushed prior to 1354 * an uncached write such as our mmio register access, the empirical 1355 * evidence (esp. on Braswell) suggests that the WC write into memory 1356 * may not be visible to the HW prior to the completion of the UC 1357 * register write and that we may begin execution from the context 1358 * before its image is complete leading to invalid PD chasing. 
1359 */ 1360 wmb(); 1361 1362 ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; 1363 return desc; 1364 } 1365 1366 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1367 { 1368 if (execlists->ctrl_reg) { 1369 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1370 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1371 } else { 1372 writel(upper_32_bits(desc), execlists->submit_reg); 1373 writel(lower_32_bits(desc), execlists->submit_reg); 1374 } 1375 } 1376 1377 static __maybe_unused void 1378 trace_ports(const struct intel_engine_execlists *execlists, 1379 const char *msg, 1380 struct i915_request * const *ports) 1381 { 1382 const struct intel_engine_cs *engine = 1383 container_of(execlists, typeof(*engine), execlists); 1384 1385 if (!ports[0]) 1386 return; 1387 1388 ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg, 1389 ports[0]->fence.context, 1390 ports[0]->fence.seqno, 1391 i915_request_completed(ports[0]) ? "!" : 1392 i915_request_started(ports[0]) ? "*" : 1393 "", 1394 ports[1] ? ports[1]->fence.context : 0, 1395 ports[1] ? ports[1]->fence.seqno : 0); 1396 } 1397 1398 static __maybe_unused bool 1399 assert_pending_valid(const struct intel_engine_execlists *execlists, 1400 const char *msg) 1401 { 1402 struct i915_request * const *port, *rq; 1403 struct intel_context *ce = NULL; 1404 1405 trace_ports(execlists, msg, execlists->pending); 1406 1407 if (!execlists->pending[0]) { 1408 GEM_TRACE_ERR("Nothing pending for promotion!\n"); 1409 return false; 1410 } 1411 1412 if (execlists->pending[execlists_num_ports(execlists)]) { 1413 GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", 1414 execlists_num_ports(execlists)); 1415 return false; 1416 } 1417 1418 for (port = execlists->pending; (rq = *port); port++) { 1419 unsigned long flags; 1420 bool ok = true; 1421 1422 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1423 GEM_BUG_ON(!i915_request_is_active(rq)); 1424 1425 if (ce == rq->context) { 1426 GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n", 1427 ce->timeline->fence_context, 1428 port - execlists->pending); 1429 return false; 1430 } 1431 ce = rq->context; 1432 1433 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1434 if (!spin_trylock_irqsave(&rq->lock, flags)) 1435 continue; 1436 1437 if (i915_request_completed(rq)) 1438 goto unlock; 1439 1440 if (i915_active_is_idle(&ce->active) && 1441 !intel_context_is_barrier(ce)) { 1442 GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n", 1443 ce->timeline->fence_context, 1444 port - execlists->pending); 1445 ok = false; 1446 goto unlock; 1447 } 1448 1449 if (!i915_vma_is_pinned(ce->state)) { 1450 GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n", 1451 ce->timeline->fence_context, 1452 port - execlists->pending); 1453 ok = false; 1454 goto unlock; 1455 } 1456 1457 if (!i915_vma_is_pinned(ce->ring->vma)) { 1458 GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n", 1459 ce->timeline->fence_context, 1460 port - execlists->pending); 1461 ok = false; 1462 goto unlock; 1463 } 1464 1465 unlock: 1466 spin_unlock_irqrestore(&rq->lock, flags); 1467 if (!ok) 1468 return false; 1469 } 1470 1471 return ce; 1472 } 1473 1474 static void execlists_submit_ports(struct intel_engine_cs *engine) 1475 { 1476 struct intel_engine_execlists *execlists = &engine->execlists; 1477 unsigned int n; 1478 1479 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1480 1481 /* 1482 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1483 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1484 * not be relinquished until the device is idle (see 1485 * i915_gem_idle_work_handler()). As a precaution, we make sure 1486 * that all ELSP are drained i.e. we have processed the CSB, 1487 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1488 */ 1489 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1490 1491 /* 1492 * ELSQ note: the submit queue is not cleared after being submitted 1493 * to the HW so we need to make sure we always clean it up. This is 1494 * currently ensured by the fact that we always write the same number 1495 * of elsq entries, keep this in mind before changing the loop below. 1496 */ 1497 for (n = execlists_num_ports(execlists); n--; ) { 1498 struct i915_request *rq = execlists->pending[n]; 1499 1500 write_desc(execlists, 1501 rq ? execlists_update_context(rq) : 0, 1502 n); 1503 } 1504 1505 /* we need to manually load the submit queue */ 1506 if (execlists->ctrl_reg) 1507 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1508 } 1509 1510 static bool ctx_single_port_submission(const struct intel_context *ce) 1511 { 1512 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1513 intel_context_force_single_submission(ce)); 1514 } 1515 1516 static bool can_merge_ctx(const struct intel_context *prev, 1517 const struct intel_context *next) 1518 { 1519 if (prev != next) 1520 return false; 1521 1522 if (ctx_single_port_submission(prev)) 1523 return false; 1524 1525 return true; 1526 } 1527 1528 static bool can_merge_rq(const struct i915_request *prev, 1529 const struct i915_request *next) 1530 { 1531 GEM_BUG_ON(prev == next); 1532 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1533 1534 /* 1535 * We do not submit known completed requests. Therefore if the next 1536 * request is already completed, we can pretend to merge it in 1537 * with the previous context (and we will skip updating the ELSP 1538 * and tracking). Thus hopefully keeping the ELSP full with active 1539 * contexts, despite the best efforts of preempt-to-busy to confuse 1540 * us. 
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((prev->fence.flags ^ next->fence.flags) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
				     struct intel_engine_cs *engine)
{
	struct intel_engine_cs *old = ve->siblings[0];

	/* All unattached (rq->engine == old) must already be completed */

	spin_lock(&old->breadcrumbs.irq_lock);
	if (!list_empty(&ve->context.signal_link)) {
		list_move_tail(&ve->context.signal_link,
			       &engine->breadcrumbs.signalers);
		intel_engine_signal_breadcrumbs(engine);
	}
	spin_unlock(&old->breadcrumbs.irq_lock);
}

static struct i915_request *
last_active(const struct intel_engine_execlists *execlists)
{
	struct i915_request * const *last = READ_ONCE(execlists->active);

	while (*last && i915_request_completed(*last))
		last++;

	return *last;
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
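	 *
	 * A small illustration (request names invented for this sketch): if
	 * requests A, B and C sit at the same priority on this engine, with
	 * B and C waiting on A, then deferring A also walks its waiter list
	 * and re-queues B and C behind it,
	 *
	 *	pl: ..., A, B, C
	 *
	 * so that no waiter is left queued ahead of the signaler it depends
	 * on.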
1629 */ 1630 do { 1631 struct i915_dependency *p; 1632 1633 GEM_BUG_ON(i915_request_is_active(rq)); 1634 list_move_tail(&rq->sched.link, pl); 1635 1636 for_each_waiter(p, rq) { 1637 struct i915_request *w = 1638 container_of(p->waiter, typeof(*w), sched); 1639 1640 /* Leave semaphores spinning on the other engines */ 1641 if (w->engine != rq->engine) 1642 continue; 1643 1644 /* No waiter should start before its signaler */ 1645 GEM_BUG_ON(i915_request_started(w) && 1646 !i915_request_completed(rq)); 1647 1648 GEM_BUG_ON(i915_request_is_active(w)); 1649 if (!i915_request_is_ready(w)) 1650 continue; 1651 1652 if (rq_prio(w) < rq_prio(rq)) 1653 continue; 1654 1655 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1656 list_move_tail(&w->sched.link, &list); 1657 } 1658 1659 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1660 } while (rq); 1661 } 1662 1663 static void defer_active(struct intel_engine_cs *engine) 1664 { 1665 struct i915_request *rq; 1666 1667 rq = __unwind_incomplete_requests(engine); 1668 if (!rq) 1669 return; 1670 1671 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1672 } 1673 1674 static bool 1675 need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq) 1676 { 1677 int hint; 1678 1679 if (!intel_engine_has_timeslices(engine)) 1680 return false; 1681 1682 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1683 return false; 1684 1685 hint = max(rq_prio(list_next_entry(rq, sched.link)), 1686 engine->execlists.queue_priority_hint); 1687 1688 return hint >= effective_prio(rq); 1689 } 1690 1691 static int 1692 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1693 { 1694 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1695 return INT_MIN; 1696 1697 return rq_prio(list_next_entry(rq, sched.link)); 1698 } 1699 1700 static inline unsigned long 1701 timeslice(const struct intel_engine_cs *engine) 1702 { 1703 return READ_ONCE(engine->props.timeslice_duration_ms); 1704 } 1705 1706 static unsigned long 1707 active_timeslice(const struct intel_engine_cs *engine) 1708 { 1709 const struct i915_request *rq = *engine->execlists.active; 1710 1711 if (!rq || i915_request_completed(rq)) 1712 return 0; 1713 1714 if (engine->execlists.switch_priority_hint < effective_prio(rq)) 1715 return 0; 1716 1717 return timeslice(engine); 1718 } 1719 1720 static void set_timeslice(struct intel_engine_cs *engine) 1721 { 1722 if (!intel_engine_has_timeslices(engine)) 1723 return; 1724 1725 set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); 1726 } 1727 1728 static void record_preemption(struct intel_engine_execlists *execlists) 1729 { 1730 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 1731 } 1732 1733 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine) 1734 { 1735 struct i915_request *rq; 1736 1737 rq = last_active(&engine->execlists); 1738 if (!rq) 1739 return 0; 1740 1741 /* Force a fast reset for terminated contexts (ignoring sysfs!) 
*/ 1742 if (unlikely(intel_context_is_banned(rq->context))) 1743 return 1; 1744 1745 return READ_ONCE(engine->props.preempt_timeout_ms); 1746 } 1747 1748 static void set_preempt_timeout(struct intel_engine_cs *engine) 1749 { 1750 if (!intel_engine_has_preempt_reset(engine)) 1751 return; 1752 1753 set_timer_ms(&engine->execlists.preempt, 1754 active_preempt_timeout(engine)); 1755 } 1756 1757 static inline void clear_ports(struct i915_request **ports, int count) 1758 { 1759 memset_p((void **)ports, NULL, count); 1760 } 1761 1762 static void execlists_dequeue(struct intel_engine_cs *engine) 1763 { 1764 struct intel_engine_execlists * const execlists = &engine->execlists; 1765 struct i915_request **port = execlists->pending; 1766 struct i915_request ** const last_port = port + execlists->port_mask; 1767 struct i915_request *last; 1768 struct rb_node *rb; 1769 bool submit = false; 1770 1771 /* 1772 * Hardware submission is through 2 ports. Conceptually each port 1773 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 1774 * static for a context, and unique to each, so we only execute 1775 * requests belonging to a single context from each ring. RING_HEAD 1776 * is maintained by the CS in the context image, it marks the place 1777 * where it got up to last time, and through RING_TAIL we tell the CS 1778 * where we want to execute up to this time. 1779 * 1780 * In this list the requests are in order of execution. Consecutive 1781 * requests from the same context are adjacent in the ringbuffer. We 1782 * can combine these requests into a single RING_TAIL update: 1783 * 1784 * RING_HEAD...req1...req2 1785 * ^- RING_TAIL 1786 * since to execute req2 the CS must first execute req1. 1787 * 1788 * Our goal then is to point each port to the end of a consecutive 1789 * sequence of requests as being the most optimal (fewest wake ups 1790 * and context switches) submission. 1791 */ 1792 1793 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 1794 struct virtual_engine *ve = 1795 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1796 struct i915_request *rq = READ_ONCE(ve->request); 1797 1798 if (!rq) { /* lazily cleanup after another engine handled rq */ 1799 rb_erase_cached(rb, &execlists->virtual); 1800 RB_CLEAR_NODE(rb); 1801 rb = rb_first_cached(&execlists->virtual); 1802 continue; 1803 } 1804 1805 if (!virtual_matches(ve, rq, engine)) { 1806 rb = rb_next(rb); 1807 continue; 1808 } 1809 1810 break; 1811 } 1812 1813 /* 1814 * If the queue is higher priority than the last 1815 * request in the currently active context, submit afresh. 1816 * We will resubmit again afterwards in case we need to split 1817 * the active context to interject the preemption request, 1818 * i.e. we will retrigger preemption following the ack in case 1819 * of trouble. 1820 */ 1821 last = last_active(execlists); 1822 if (last) { 1823 if (need_preempt(engine, last, rb)) { 1824 ENGINE_TRACE(engine, 1825 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 1826 last->fence.context, 1827 last->fence.seqno, 1828 last->sched.attr.priority, 1829 execlists->queue_priority_hint); 1830 record_preemption(execlists); 1831 1832 /* 1833 * Don't let the RING_HEAD advance past the breadcrumb 1834 * as we unwind (and until we resubmit) so that we do 1835 * not accidentally tell it to go backwards. 
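			 *
			 * Pausing works via the HWS_PREEMPT dword that
			 * ring_set_paused() writes below: the semaphore
			 * emitted by engine->emit_fini_breadcrumb busywaits
			 * until that dword reads as zero again, so the CS
			 * cannot run past the last completed breadcrumb while
			 * we rewrite RING_TAIL. A sketch of that busywait
			 * (exact flags and dword layout assumed, not shown in
			 * this section):
			 *
			 *	*cs++ = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL |
			 *		MI_SEMAPHORE_SAD_EQ_SDD;
			 *	*cs++ = 0;
			 *	*cs++ = intel_hws_preempt_address(engine);
			 *	*cs++ = 0;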
1836 */ 1837 ring_set_paused(engine, 1); 1838 1839 /* 1840 * Note that we have not stopped the GPU at this point, 1841 * so we are unwinding the incomplete requests as they 1842 * remain inflight and so by the time we do complete 1843 * the preemption, some of the unwound requests may 1844 * complete! 1845 */ 1846 __unwind_incomplete_requests(engine); 1847 1848 last = NULL; 1849 } else if (need_timeslice(engine, last) && 1850 timer_expired(&engine->execlists.timer)) { 1851 ENGINE_TRACE(engine, 1852 "expired last=%llx:%lld, prio=%d, hint=%d\n", 1853 last->fence.context, 1854 last->fence.seqno, 1855 last->sched.attr.priority, 1856 execlists->queue_priority_hint); 1857 1858 ring_set_paused(engine, 1); 1859 defer_active(engine); 1860 1861 /* 1862 * Unlike for preemption, if we rewind and continue 1863 * executing the same context as previously active, 1864 * the order of execution will remain the same and 1865 * the tail will only advance. We do not need to 1866 * force a full context restore, as a lite-restore 1867 * is sufficient to resample the monotonic TAIL. 1868 * 1869 * If we switch to any other context, similarly we 1870 * will not rewind TAIL of current context, and 1871 * normal save/restore will preserve state and allow 1872 * us to later continue executing the same request. 1873 */ 1874 last = NULL; 1875 } else { 1876 /* 1877 * Otherwise if we already have a request pending 1878 * for execution after the current one, we can 1879 * just wait until the next CS event before 1880 * queuing more. In either case we will force a 1881 * lite-restore preemption event, but if we wait 1882 * we hopefully coalesce several updates into a single 1883 * submission. 1884 */ 1885 if (!list_is_last(&last->sched.link, 1886 &engine->active.requests)) { 1887 /* 1888 * Even if ELSP[1] is occupied and not worthy 1889 * of timeslices, our queue might be. 1890 */ 1891 if (!execlists->timer.expires && 1892 need_timeslice(engine, last)) 1893 set_timer_ms(&execlists->timer, 1894 timeslice(engine)); 1895 1896 return; 1897 } 1898 } 1899 } 1900 1901 while (rb) { /* XXX virtual is always taking precedence */ 1902 struct virtual_engine *ve = 1903 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1904 struct i915_request *rq; 1905 1906 spin_lock(&ve->base.active.lock); 1907 1908 rq = ve->request; 1909 if (unlikely(!rq)) { /* lost the race to a sibling */ 1910 spin_unlock(&ve->base.active.lock); 1911 rb_erase_cached(rb, &execlists->virtual); 1912 RB_CLEAR_NODE(rb); 1913 rb = rb_first_cached(&execlists->virtual); 1914 continue; 1915 } 1916 1917 GEM_BUG_ON(rq != ve->request); 1918 GEM_BUG_ON(rq->engine != &ve->base); 1919 GEM_BUG_ON(rq->context != &ve->context); 1920 1921 if (rq_prio(rq) >= queue_prio(execlists)) { 1922 if (!virtual_matches(ve, rq, engine)) { 1923 spin_unlock(&ve->base.active.lock); 1924 rb = rb_next(rb); 1925 continue; 1926 } 1927 1928 if (last && !can_merge_rq(last, rq)) { 1929 spin_unlock(&ve->base.active.lock); 1930 return; /* leave this for another */ 1931 } 1932 1933 ENGINE_TRACE(engine, 1934 "virtual rq=%llx:%lld%s, new engine? %s\n", 1935 rq->fence.context, 1936 rq->fence.seqno, 1937 i915_request_completed(rq) ? "!" : 1938 i915_request_started(rq) ? 
"*" : 1939 "", 1940 yesno(engine != ve->siblings[0])); 1941 1942 ve->request = NULL; 1943 ve->base.execlists.queue_priority_hint = INT_MIN; 1944 rb_erase_cached(rb, &execlists->virtual); 1945 RB_CLEAR_NODE(rb); 1946 1947 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 1948 rq->engine = engine; 1949 1950 if (engine != ve->siblings[0]) { 1951 u32 *regs = ve->context.lrc_reg_state; 1952 unsigned int n; 1953 1954 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1955 1956 if (!intel_engine_has_relative_mmio(engine)) 1957 virtual_update_register_offsets(regs, 1958 engine); 1959 1960 if (!list_empty(&ve->context.signals)) 1961 virtual_xfer_breadcrumbs(ve, engine); 1962 1963 /* 1964 * Move the bound engine to the top of the list 1965 * for future execution. We then kick this 1966 * tasklet first before checking others, so that 1967 * we preferentially reuse this set of bound 1968 * registers. 1969 */ 1970 for (n = 1; n < ve->num_siblings; n++) { 1971 if (ve->siblings[n] == engine) { 1972 swap(ve->siblings[n], 1973 ve->siblings[0]); 1974 break; 1975 } 1976 } 1977 1978 GEM_BUG_ON(ve->siblings[0] != engine); 1979 } 1980 1981 if (__i915_request_submit(rq)) { 1982 submit = true; 1983 last = rq; 1984 } 1985 i915_request_put(rq); 1986 1987 /* 1988 * Hmm, we have a bunch of virtual engine requests, 1989 * but the first one was already completed (thanks 1990 * preempt-to-busy!). Keep looking at the veng queue 1991 * until we have no more relevant requests (i.e. 1992 * the normal submit queue has higher priority). 1993 */ 1994 if (!submit) { 1995 spin_unlock(&ve->base.active.lock); 1996 rb = rb_first_cached(&execlists->virtual); 1997 continue; 1998 } 1999 } 2000 2001 spin_unlock(&ve->base.active.lock); 2002 break; 2003 } 2004 2005 while ((rb = rb_first_cached(&execlists->queue))) { 2006 struct i915_priolist *p = to_priolist(rb); 2007 struct i915_request *rq, *rn; 2008 int i; 2009 2010 priolist_for_each_request_consume(rq, rn, p, i) { 2011 bool merge = true; 2012 2013 /* 2014 * Can we combine this request with the current port? 2015 * It has to be the same context/ringbuffer and not 2016 * have any exceptions (e.g. GVT saying never to 2017 * combine contexts). 2018 * 2019 * If we can combine the requests, we can execute both 2020 * by updating the RING_TAIL to point to the end of the 2021 * second request, and so we never need to tell the 2022 * hardware about the first. 2023 */ 2024 if (last && !can_merge_rq(last, rq)) { 2025 /* 2026 * If we are on the second port and cannot 2027 * combine this request with the last, then we 2028 * are done. 2029 */ 2030 if (port == last_port) 2031 goto done; 2032 2033 /* 2034 * We must not populate both ELSP[] with the 2035 * same LRCA, i.e. we must submit 2 different 2036 * contexts if we submit 2 ELSP. 2037 */ 2038 if (last->context == rq->context) 2039 goto done; 2040 2041 if (i915_request_has_sentinel(last)) 2042 goto done; 2043 2044 /* 2045 * If GVT overrides us we only ever submit 2046 * port[0], leaving port[1] empty. Note that we 2047 * also have to be careful that we don't queue 2048 * the same context (even though a different 2049 * request) to the second port. 
2050 */ 2051 if (ctx_single_port_submission(last->context) || 2052 ctx_single_port_submission(rq->context)) 2053 goto done; 2054 2055 merge = false; 2056 } 2057 2058 if (__i915_request_submit(rq)) { 2059 if (!merge) { 2060 *port = execlists_schedule_in(last, port - execlists->pending); 2061 port++; 2062 last = NULL; 2063 } 2064 2065 GEM_BUG_ON(last && 2066 !can_merge_ctx(last->context, 2067 rq->context)); 2068 2069 submit = true; 2070 last = rq; 2071 } 2072 } 2073 2074 rb_erase_cached(&p->node, &execlists->queue); 2075 i915_priolist_free(p); 2076 } 2077 2078 done: 2079 /* 2080 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2081 * 2082 * We choose the priority hint such that if we add a request of greater 2083 * priority than this, we kick the submission tasklet to decide on 2084 * the right order of submitting the requests to hardware. We must 2085 * also be prepared to reorder requests as they are in-flight on the 2086 * HW. We derive the priority hint then as the first "hole" in 2087 * the HW submission ports and if there are no available slots, 2088 * the priority of the lowest executing request, i.e. last. 2089 * 2090 * When we do receive a higher priority request ready to run from the 2091 * user, see queue_request(), the priority hint is bumped to that 2092 * request triggering preemption on the next dequeue (or subsequent 2093 * interrupt for secondary ports). 2094 */ 2095 execlists->queue_priority_hint = queue_prio(execlists); 2096 2097 if (submit) { 2098 *port = execlists_schedule_in(last, port - execlists->pending); 2099 execlists->switch_priority_hint = 2100 switch_prio(engine, *execlists->pending); 2101 2102 /* 2103 * Skip if we ended up with exactly the same set of requests, 2104 * e.g. trying to timeslice a pair of ordered contexts 2105 */ 2106 if (!memcmp(execlists->active, execlists->pending, 2107 (port - execlists->pending + 1) * sizeof(*port))) { 2108 do 2109 execlists_schedule_out(fetch_and_zero(port)); 2110 while (port-- != execlists->pending); 2111 2112 goto skip_submit; 2113 } 2114 clear_ports(port + 1, last_port - port); 2115 2116 execlists_submit_ports(engine); 2117 set_preempt_timeout(engine); 2118 } else { 2119 skip_submit: 2120 ring_set_paused(engine, 0); 2121 } 2122 } 2123 2124 static void 2125 cancel_port_requests(struct intel_engine_execlists * const execlists) 2126 { 2127 struct i915_request * const *port; 2128 2129 for (port = execlists->pending; *port; port++) 2130 execlists_schedule_out(*port); 2131 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2132 2133 /* Mark the end of active before we overwrite *active */ 2134 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2135 execlists_schedule_out(*port); 2136 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2137 2138 WRITE_ONCE(execlists->active, execlists->inflight); 2139 } 2140 2141 static inline void 2142 invalidate_csb_entries(const u32 *first, const u32 *last) 2143 { 2144 clflush((void *)first); 2145 clflush((void *)last); 2146 } 2147 2148 static inline bool 2149 reset_in_progress(const struct intel_engine_execlists *execlists) 2150 { 2151 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 2152 } 2153 2154 /* 2155 * Starting with Gen12, the status has a new format: 2156 * 2157 * bit 0: switched to new queue 2158 * bit 1: reserved 2159 * bit 2: semaphore wait mode (poll or signal), only valid when 2160 * switch detail is set to "wait on semaphore" 2161 * bits 3-5: engine class 2162 * bits 6-11: engine instance 2163 * 
bits 12-14: reserved
2164 * bits 15-25: sw context id of the lrc the GT switched to
2165 * bits 26-31: sw counter of the lrc the GT switched to
2166 * bits 32-35: context switch detail
2167 * - 0: ctx complete
2168 * - 1: wait on sync flip
2169 * - 2: wait on vblank
2170 * - 3: wait on scanline
2171 * - 4: wait on semaphore
2172 * - 5: context preempted (not on SEMAPHORE_WAIT or
2173 * WAIT_FOR_EVENT)
2174 * bit 36: reserved
2175 * bits 37-43: wait detail (for switch detail 1 to 4)
2176 * bits 44-46: reserved
2177 * bits 47-57: sw context id of the lrc the GT switched away from
2178 * bits 58-63: sw counter of the lrc the GT switched away from
2179 */
2180 static inline bool
2181 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2182 {
2183 u32 lower_dw = csb[0];
2184 u32 upper_dw = csb[1];
2185 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2186 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2187 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2188
2189 /*
2190 * The context switch detail is not guaranteed to be 5 when a preemption
2191 * occurs, so we can't just check for that. The check below works for
2192 * all the cases we care about, including preemptions of WAIT
2193 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2194 * would require some extra handling, but we don't support that.
2195 */
2196 if (!ctx_away_valid || new_queue) {
2197 GEM_BUG_ON(!ctx_to_valid);
2198 return true;
2199 }
2200
2201 /*
2202 * switch detail = 5 is covered by the case above and we do not expect a
2203 * context switch on an unsuccessful wait instruction since we always
2204 * use polling mode.
2205 */
2206 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2207 return false;
2208 }
2209
2210 static inline bool
2211 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2212 {
2213 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2214 }
2215
2216 static void process_csb(struct intel_engine_cs *engine)
2217 {
2218 struct intel_engine_execlists * const execlists = &engine->execlists;
2219 const u32 * const buf = execlists->csb_status;
2220 const u8 num_entries = execlists->csb_size;
2221 u8 head, tail;
2222
2223 /*
2224 * As we modify our execlists state tracking we require exclusive
2225 * access. Either we are inside the tasklet, or the tasklet is disabled
2226 * and we assume that is only inside the reset paths and so serialised.
2227 */
2228 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2229 !reset_in_progress(execlists));
2230 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2231
2232 /*
2233 * Note that csb_write, csb_status may be either in HWSP or mmio.
2234 * When reading from the csb_write mmio register, we have to be
2235 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2236 * the low 4 bits. As it happens we know the next 4 bits are always
2237 * zero and so we can simply mask off the low u8 of the register
2238 * and treat it identically to reading from the HWSP (without having
2239 * to use explicit shifting and masking, and probably bifurcating
2240 * the code to handle the legacy mmio read).
2241 */
2242 head = execlists->csb_head;
2243 tail = READ_ONCE(*execlists->csb_write);
2244 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2245 if (unlikely(head == tail))
2246 return;
2247
2248 /*
2249 * Hopefully paired with a wmb() in HW!
2250 *
2251 * We must complete the read of the write pointer before any reads
2252 * from the CSB, so that we do not see stale values. Without an rmb
2253 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2254 * we perform the READ_ONCE(*csb_write).
2255 */
2256 rmb();
2257
2258 do {
2259 bool promote;
2260
2261 if (++head == num_entries)
2262 head = 0;
2263
2264 /*
2265 * We are flying near dragons again.
2266 *
2267 * We hold a reference to the request in execlist_port[]
2268 * but no more than that. We are operating in softirq
2269 * context and so cannot hold any mutex or sleep. That
2270 * means we cannot stop the requests we are processing
2271 * in port[] from being retired concurrently (the
2272 * breadcrumb will be complete before we see the
2273 * context-switch). As we only hold the reference to the
2274 * request, any pointer chasing underneath the request
2275 * is subject to a potential use-after-free. Thus we
2276 * store all of the bookkeeping within port[] as
2277 * required, and avoid using unguarded pointers beneath
2278 * request itself. The same applies to the atomic
2279 * status notifier.
2280 */
2281
2282 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2283 head, buf[2 * head + 0], buf[2 * head + 1]);
2284
2285 if (INTEL_GEN(engine->i915) >= 12)
2286 promote = gen12_csb_parse(execlists, buf + 2 * head);
2287 else
2288 promote = gen8_csb_parse(execlists, buf + 2 * head);
2289 if (promote) {
2290 struct i915_request * const *old = execlists->active;
2291
2292 /* Point active to the new ELSP; prevent overwriting */
2293 WRITE_ONCE(execlists->active, execlists->pending);
2294
2295 if (!inject_preempt_hang(execlists))
2296 ring_set_paused(engine, 0);
2297
2298 /* cancel old inflight, prepare for switch */
2299 trace_ports(execlists, "preempted", old);
2300 while (*old)
2301 execlists_schedule_out(*old++);
2302
2303 /* switch pending to inflight */
2304 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2305 WRITE_ONCE(execlists->active,
2306 memcpy(execlists->inflight,
2307 execlists->pending,
2308 execlists_num_ports(execlists) *
2309 sizeof(*execlists->pending)));
2310
2311 WRITE_ONCE(execlists->pending[0], NULL);
2312 } else {
2313 GEM_BUG_ON(!*execlists->active);
2314
2315 /* port0 completed, advanced to port1 */
2316 trace_ports(execlists, "completed", execlists->active);
2317
2318 /*
2319 * We rely on the hardware being strongly
2320 * ordered, that the breadcrumb write is
2321 * coherent (visible from the CPU) before the
2322 * user interrupt and CSB is processed.
2323 */
2324 GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2325 !reset_in_progress(execlists));
2326 execlists_schedule_out(*execlists->active++);
2327
2328 GEM_BUG_ON(execlists->active - execlists->inflight >
2329 execlists_num_ports(execlists));
2330 }
2331 } while (head != tail);
2332
2333 execlists->csb_head = head;
2334 set_timeslice(engine);
2335
2336 /*
2337 * Gen11 has proven to fail wrt global observation point between
2338 * entry and tail update, failing on the ordering and thus
2339 * we see an old entry in the context status buffer.
2340 *
2341 * Forcibly evict the entries for the next gpu csb update,
2342 * to increase the odds that we get fresh entries even with
2343 * non-working hardware. The cost of doing so mostly comes out
2344 * in the wash as the hardware, working or not, will need to do
2345 * the invalidation beforehand anyway.
2346 */
2347 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2348 }
2349
2350 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2351 {
2352 lockdep_assert_held(&engine->active.lock);
2353 if (!engine->execlists.pending[0]) {
2354 rcu_read_lock(); /* protect peeking at execlists->active */
2355 execlists_dequeue(engine);
2356 rcu_read_unlock();
2357 }
2358 }
2359
2360 static void __execlists_hold(struct i915_request *rq)
2361 {
2362 LIST_HEAD(list);
2363
2364 do {
2365 struct i915_dependency *p;
2366
2367 if (i915_request_is_active(rq))
2368 __i915_request_unsubmit(rq);
2369
2370 RQ_TRACE(rq, "on hold\n");
2371 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2372 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2373 i915_request_set_hold(rq);
2374
2375 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2376 struct i915_request *w =
2377 container_of(p->waiter, typeof(*w), sched);
2378
2379 /* Leave semaphores spinning on the other engines */
2380 if (w->engine != rq->engine)
2381 continue;
2382
2383 if (!i915_request_is_ready(w))
2384 continue;
2385
2386 if (i915_request_completed(w))
2387 continue;
2388
2389 if (i915_request_on_hold(w))
2390 continue;
2391
2392 list_move_tail(&w->sched.link, &list);
2393 }
2394
2395 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2396 } while (rq);
2397 }
2398
2399 static bool execlists_hold(struct intel_engine_cs *engine,
2400 struct i915_request *rq)
2401 {
2402 spin_lock_irq(&engine->active.lock);
2403
2404 if (i915_request_completed(rq)) { /* too late! */
2405 rq = NULL;
2406 goto unlock;
2407 }
2408
2409 if (rq->engine != engine) { /* preempted virtual engine */
2410 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2411
2412 /*
2413 * intel_context_inflight() is only protected by virtue
2414 * of process_csb() being called only by the tasklet (or
2415 * directly from inside reset while the tasklet is suspended).
2416 * Assert that neither of those are allowed to run while we
2417 * poke at the request queues.
2418 */
2419 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2420
2421 /*
2422 * An unsubmitted request along a virtual engine will
2423 * remain on the active (this) engine until we are able
2424 * to process the context switch away (and so mark the
2425 * context as no longer in flight). That cannot have happened
2426 * yet, otherwise we would not be hanging!
2427 */
2428 spin_lock(&ve->base.active.lock);
2429 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2430 GEM_BUG_ON(ve->request != rq);
2431 ve->request = NULL;
2432 spin_unlock(&ve->base.active.lock);
2433 i915_request_put(rq);
2434
2435 rq->engine = engine;
2436 }
2437
2438 /*
2439 * Transfer this request onto the hold queue to prevent it
2440 * being resubmitted to HW (and potentially completed) before we have
2441 * released it. Since we may have already submitted following
2442 * requests, we need to remove those as well.
2443 */
2444 GEM_BUG_ON(i915_request_on_hold(rq));
2445 GEM_BUG_ON(rq->engine != engine);
2446 __execlists_hold(rq);
2447
2448 unlock:
2449 spin_unlock_irq(&engine->active.lock);
2450 return rq;
2451 }
2452
2453 static bool hold_request(const struct i915_request *rq)
2454 {
2455 struct i915_dependency *p;
2456
2457 /*
2458 * If one of our ancestors is on hold, we must also be on hold,
2459 * otherwise we will bypass it and execute before it.
2460 */
2461 list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2462 const struct i915_request *s =
2463 container_of(p->signaler, typeof(*s), sched);
2464
2465 if (s->engine != rq->engine)
2466 continue;
2467
2468 if (i915_request_on_hold(s))
2469 return true;
2470 }
2471
2472 return false;
2473 }
2474
2475 static void __execlists_unhold(struct i915_request *rq)
2476 {
2477 LIST_HEAD(list);
2478
2479 do {
2480 struct i915_dependency *p;
2481
2482 GEM_BUG_ON(!i915_request_on_hold(rq));
2483 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2484
2485 i915_request_clear_hold(rq);
2486 list_move_tail(&rq->sched.link,
2487 i915_sched_lookup_priolist(rq->engine,
2488 rq_prio(rq)));
2489 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2490 RQ_TRACE(rq, "hold release\n");
2491
2492 /* Also release any children on this engine that are ready */
2493 list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2494 struct i915_request *w =
2495 container_of(p->waiter, typeof(*w), sched);
2496
2497 if (w->engine != rq->engine)
2498 continue;
2499
2500 if (!i915_request_on_hold(w))
2501 continue;
2502
2503 /* Check that no other parents are also on hold */
2504 if (hold_request(w))
2505 continue;
2506
2507 list_move_tail(&w->sched.link, &list);
2508 }
2509
2510 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2511 } while (rq);
2512 }
2513
2514 static void execlists_unhold(struct intel_engine_cs *engine,
2515 struct i915_request *rq)
2516 {
2517 spin_lock_irq(&engine->active.lock);
2518
2519 /*
2520 * Move this request back to the priority queue, and all of its
2521 * children and grandchildren that were suspended along with it.
2522 */
2523 __execlists_unhold(rq);
2524
2525 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2526 engine->execlists.queue_priority_hint = rq_prio(rq);
2527 tasklet_hi_schedule(&engine->execlists.tasklet);
2528 }
2529
2530 spin_unlock_irq(&engine->active.lock);
2531 }
2532
2533 struct execlists_capture {
2534 struct work_struct work;
2535 struct i915_request *rq;
2536 struct i915_gpu_coredump *error;
2537 };
2538
2539 static void execlists_capture_work(struct work_struct *work)
2540 {
2541 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2542 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2543 struct intel_engine_cs *engine = cap->rq->engine;
2544 struct intel_gt_coredump *gt = cap->error->gt;
2545 struct intel_engine_capture_vma *vma;
2546
2547 /* Compress all the objects attached to the request, slow!
*/ 2548 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2549 if (vma) { 2550 struct i915_vma_compress *compress = 2551 i915_vma_capture_prepare(gt); 2552 2553 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2554 i915_vma_capture_finish(gt, compress); 2555 } 2556 2557 gt->simulated = gt->engine->simulated; 2558 cap->error->simulated = gt->simulated; 2559 2560 /* Publish the error state, and announce it to the world */ 2561 i915_error_state_store(cap->error); 2562 i915_gpu_coredump_put(cap->error); 2563 2564 /* Return this request and all that depend upon it for signaling */ 2565 execlists_unhold(engine, cap->rq); 2566 i915_request_put(cap->rq); 2567 2568 kfree(cap); 2569 } 2570 2571 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2572 { 2573 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2574 struct execlists_capture *cap; 2575 2576 cap = kmalloc(sizeof(*cap), gfp); 2577 if (!cap) 2578 return NULL; 2579 2580 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2581 if (!cap->error) 2582 goto err_cap; 2583 2584 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2585 if (!cap->error->gt) 2586 goto err_gpu; 2587 2588 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2589 if (!cap->error->gt->engine) 2590 goto err_gt; 2591 2592 return cap; 2593 2594 err_gt: 2595 kfree(cap->error->gt); 2596 err_gpu: 2597 kfree(cap->error); 2598 err_cap: 2599 kfree(cap); 2600 return NULL; 2601 } 2602 2603 static bool execlists_capture(struct intel_engine_cs *engine) 2604 { 2605 struct execlists_capture *cap; 2606 2607 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2608 return true; 2609 2610 /* 2611 * We need to _quickly_ capture the engine state before we reset. 2612 * We are inside an atomic section (softirq) here and we are delaying 2613 * the forced preemption event. 2614 */ 2615 cap = capture_regs(engine); 2616 if (!cap) 2617 return true; 2618 2619 cap->rq = execlists_active(&engine->execlists); 2620 GEM_BUG_ON(!cap->rq); 2621 2622 rcu_read_lock(); 2623 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 2624 cap->rq = i915_request_get_rcu(cap->rq); 2625 rcu_read_unlock(); 2626 if (!cap->rq) 2627 goto err_free; 2628 2629 /* 2630 * Remove the request from the execlists queue, and take ownership 2631 * of the request. We pass it to our worker who will _slowly_ compress 2632 * all the pages the _user_ requested for debugging their batch, after 2633 * which we return it to the queue for signaling. 2634 * 2635 * By removing them from the execlists queue, we also remove the 2636 * requests from being processed by __unwind_incomplete_requests() 2637 * during the intel_engine_reset(), and so they will *not* be replayed 2638 * afterwards. 2639 * 2640 * Note that because we have not yet reset the engine at this point, 2641 * it is possible for the request that we have identified as being 2642 * guilty, did in fact complete and we will then hit an arbitration 2643 * point allowing the outstanding preemption to succeed. The likelihood 2644 * of that is very low (as capturing of the engine registers should be 2645 * fast enough to run inside an irq-off atomic section!), so we will 2646 * simply hold that request accountable for being non-preemptible 2647 * long enough to force the reset. 
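 * (Should execlists_hold() fail below because the request did complete in
 * the meantime, we abandon the capture and the caller simply unpauses the
 * ring instead of resetting the engine.)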
2648 */ 2649 if (!execlists_hold(engine, cap->rq)) 2650 goto err_rq; 2651 2652 INIT_WORK(&cap->work, execlists_capture_work); 2653 schedule_work(&cap->work); 2654 return true; 2655 2656 err_rq: 2657 i915_request_put(cap->rq); 2658 err_free: 2659 i915_gpu_coredump_put(cap->error); 2660 kfree(cap); 2661 return false; 2662 } 2663 2664 static noinline void preempt_reset(struct intel_engine_cs *engine) 2665 { 2666 const unsigned int bit = I915_RESET_ENGINE + engine->id; 2667 unsigned long *lock = &engine->gt->reset.flags; 2668 2669 if (i915_modparams.reset < 3) 2670 return; 2671 2672 if (test_and_set_bit(bit, lock)) 2673 return; 2674 2675 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 2676 tasklet_disable_nosync(&engine->execlists.tasklet); 2677 2678 ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", 2679 READ_ONCE(engine->props.preempt_timeout_ms), 2680 jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); 2681 2682 ring_set_paused(engine, 1); /* Freeze the current request in place */ 2683 if (execlists_capture(engine)) 2684 intel_engine_reset(engine, "preemption time out"); 2685 else 2686 ring_set_paused(engine, 0); 2687 2688 tasklet_enable(&engine->execlists.tasklet); 2689 clear_and_wake_up_bit(bit, lock); 2690 } 2691 2692 static bool preempt_timeout(const struct intel_engine_cs *const engine) 2693 { 2694 const struct timer_list *t = &engine->execlists.preempt; 2695 2696 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 2697 return false; 2698 2699 if (!timer_expired(t)) 2700 return false; 2701 2702 return READ_ONCE(engine->execlists.pending[0]); 2703 } 2704 2705 /* 2706 * Check the unread Context Status Buffers and manage the submission of new 2707 * contexts to the ELSP accordingly. 2708 */ 2709 static void execlists_submission_tasklet(unsigned long data) 2710 { 2711 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 2712 bool timeout = preempt_timeout(engine); 2713 2714 process_csb(engine); 2715 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 2716 unsigned long flags; 2717 2718 spin_lock_irqsave(&engine->active.lock, flags); 2719 __execlists_submission_tasklet(engine); 2720 spin_unlock_irqrestore(&engine->active.lock, flags); 2721 2722 /* Recheck after serialising with direct-submission */ 2723 if (timeout && preempt_timeout(engine)) 2724 preempt_reset(engine); 2725 } 2726 } 2727 2728 static void __execlists_kick(struct intel_engine_execlists *execlists) 2729 { 2730 /* Kick the tasklet for some interrupt coalescing and reset handling */ 2731 tasklet_hi_schedule(&execlists->tasklet); 2732 } 2733 2734 #define execlists_kick(t, member) \ 2735 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 2736 2737 static void execlists_timeslice(struct timer_list *timer) 2738 { 2739 execlists_kick(timer, timer); 2740 } 2741 2742 static void execlists_preempt(struct timer_list *timer) 2743 { 2744 execlists_kick(timer, preempt); 2745 } 2746 2747 static void queue_request(struct intel_engine_cs *engine, 2748 struct i915_request *rq) 2749 { 2750 GEM_BUG_ON(!list_empty(&rq->sched.link)); 2751 list_add_tail(&rq->sched.link, 2752 i915_sched_lookup_priolist(engine, rq_prio(rq))); 2753 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2754 } 2755 2756 static void __submit_queue_imm(struct intel_engine_cs *engine) 2757 { 2758 struct intel_engine_execlists * const execlists = &engine->execlists; 2759 2760 if (reset_in_progress(execlists)) 2761 return; /* defer until we restart the engine following reset */ 2762 2763 if 
(execlists->tasklet.func == execlists_submission_tasklet) 2764 __execlists_submission_tasklet(engine); 2765 else 2766 tasklet_hi_schedule(&execlists->tasklet); 2767 } 2768 2769 static void submit_queue(struct intel_engine_cs *engine, 2770 const struct i915_request *rq) 2771 { 2772 struct intel_engine_execlists *execlists = &engine->execlists; 2773 2774 if (rq_prio(rq) <= execlists->queue_priority_hint) 2775 return; 2776 2777 execlists->queue_priority_hint = rq_prio(rq); 2778 __submit_queue_imm(engine); 2779 } 2780 2781 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 2782 const struct i915_request *rq) 2783 { 2784 GEM_BUG_ON(i915_request_on_hold(rq)); 2785 return !list_empty(&engine->active.hold) && hold_request(rq); 2786 } 2787 2788 static void execlists_submit_request(struct i915_request *request) 2789 { 2790 struct intel_engine_cs *engine = request->engine; 2791 unsigned long flags; 2792 2793 /* Will be called from irq-context when using foreign fences. */ 2794 spin_lock_irqsave(&engine->active.lock, flags); 2795 2796 if (unlikely(ancestor_on_hold(engine, request))) { 2797 list_add_tail(&request->sched.link, &engine->active.hold); 2798 i915_request_set_hold(request); 2799 } else { 2800 queue_request(engine, request); 2801 2802 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 2803 GEM_BUG_ON(list_empty(&request->sched.link)); 2804 2805 submit_queue(engine, request); 2806 } 2807 2808 spin_unlock_irqrestore(&engine->active.lock, flags); 2809 } 2810 2811 static void __execlists_context_fini(struct intel_context *ce) 2812 { 2813 intel_ring_put(ce->ring); 2814 i915_vma_put(ce->state); 2815 } 2816 2817 static void execlists_context_destroy(struct kref *kref) 2818 { 2819 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 2820 2821 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 2822 GEM_BUG_ON(intel_context_is_pinned(ce)); 2823 2824 if (ce->state) 2825 __execlists_context_fini(ce); 2826 2827 intel_context_fini(ce); 2828 intel_context_free(ce); 2829 } 2830 2831 static void 2832 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 2833 { 2834 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2835 return; 2836 2837 vaddr += engine->context_size; 2838 2839 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 2840 } 2841 2842 static void 2843 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 2844 { 2845 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 2846 return; 2847 2848 vaddr += engine->context_size; 2849 2850 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 2851 dev_err_once(engine->i915->drm.dev, 2852 "%s context redzone overwritten!\n", 2853 engine->name); 2854 } 2855 2856 static void execlists_context_unpin(struct intel_context *ce) 2857 { 2858 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, 2859 ce->engine); 2860 2861 i915_gem_object_unpin_map(ce->state->obj); 2862 } 2863 2864 static void 2865 __execlists_update_reg_state(const struct intel_context *ce, 2866 const struct intel_engine_cs *engine, 2867 u32 head) 2868 { 2869 struct intel_ring *ring = ce->ring; 2870 u32 *regs = ce->lrc_reg_state; 2871 2872 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 2873 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 2874 2875 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 2876 regs[CTX_RING_HEAD] = head; 2877 regs[CTX_RING_TAIL] = ring->tail; 2878 2879 /* RPCS */ 2880 if (engine->class == RENDER_CLASS) { 2881 regs[CTX_R_PWR_CLK_STATE] = 2882 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 2883 2884 
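/* Also rewrite the OA/perf register state kept in the context image */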
i915_oa_init_reg_state(ce, engine); 2885 } 2886 } 2887 2888 static int 2889 __execlists_context_pin(struct intel_context *ce, 2890 struct intel_engine_cs *engine) 2891 { 2892 void *vaddr; 2893 2894 GEM_BUG_ON(!ce->state); 2895 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 2896 2897 vaddr = i915_gem_object_pin_map(ce->state->obj, 2898 i915_coherent_map_type(engine->i915) | 2899 I915_MAP_OVERRIDE); 2900 if (IS_ERR(vaddr)) 2901 return PTR_ERR(vaddr); 2902 2903 ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 2904 ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; 2905 __execlists_update_reg_state(ce, engine, ce->ring->tail); 2906 2907 return 0; 2908 } 2909 2910 static int execlists_context_pin(struct intel_context *ce) 2911 { 2912 return __execlists_context_pin(ce, ce->engine); 2913 } 2914 2915 static int execlists_context_alloc(struct intel_context *ce) 2916 { 2917 return __execlists_context_alloc(ce, ce->engine); 2918 } 2919 2920 static void execlists_context_reset(struct intel_context *ce) 2921 { 2922 CE_TRACE(ce, "reset\n"); 2923 GEM_BUG_ON(!intel_context_is_pinned(ce)); 2924 2925 /* 2926 * Because we emit WA_TAIL_DWORDS there may be a disparity 2927 * between our bookkeeping in ce->ring->head and ce->ring->tail and 2928 * that stored in context. As we only write new commands from 2929 * ce->ring->tail onwards, everything before that is junk. If the GPU 2930 * starts reading from its RING_HEAD from the context, it may try to 2931 * execute that junk and die. 2932 * 2933 * The contexts that are stilled pinned on resume belong to the 2934 * kernel, and are local to each engine. All other contexts will 2935 * have their head/tail sanitized upon pinning before use, so they 2936 * will never see garbage, 2937 * 2938 * So to avoid that we reset the context images upon resume. For 2939 * simplicity, we just zero everything out. 2940 */ 2941 intel_ring_reset(ce->ring, ce->ring->emit); 2942 2943 /* Scrub away the garbage */ 2944 execlists_init_reg_state(ce->lrc_reg_state, 2945 ce, ce->engine, ce->ring, true); 2946 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 2947 2948 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; 2949 } 2950 2951 static const struct intel_context_ops execlists_context_ops = { 2952 .alloc = execlists_context_alloc, 2953 2954 .pin = execlists_context_pin, 2955 .unpin = execlists_context_unpin, 2956 2957 .enter = intel_context_enter_engine, 2958 .exit = intel_context_exit_engine, 2959 2960 .reset = execlists_context_reset, 2961 .destroy = execlists_context_destroy, 2962 }; 2963 2964 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 2965 { 2966 u32 *cs; 2967 2968 GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb); 2969 2970 cs = intel_ring_begin(rq, 6); 2971 if (IS_ERR(cs)) 2972 return PTR_ERR(cs); 2973 2974 /* 2975 * Check if we have been preempted before we even get started. 2976 * 2977 * After this point i915_request_started() reports true, even if 2978 * we get preempted and so are no longer running. 
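 * (The MI_STORE_DWORD_IMM below writes rq->fence.seqno - 1 into the
 * timeline's HWSP slot; i915_request_started() merely checks whether that
 * value has been passed.)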
2979 */
2980 *cs++ = MI_ARB_CHECK;
2981 *cs++ = MI_NOOP;
2982
2983 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2984 *cs++ = i915_request_timeline(rq)->hwsp_offset;
2985 *cs++ = 0;
2986 *cs++ = rq->fence.seqno - 1;
2987
2988 intel_ring_advance(rq, cs);
2989
2990 /* Record the updated position of the request's payload */
2991 rq->infix = intel_ring_offset(rq, cs);
2992
2993 return 0;
2994 }
2995
2996 static int execlists_request_alloc(struct i915_request *request)
2997 {
2998 int ret;
2999
3000 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3001
3002 /*
3003 * Flush enough space to reduce the likelihood of waiting after
3004 * we start building the request - in which case we will just
3005 * have to repeat work.
3006 */
3007 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3008
3009 /*
3010 * Note that after this point, we have committed to using
3011 * this request as it is being used to both track the
3012 * state of engine initialisation and liveness of the
3013 * golden renderstate above. Think twice before you try
3014 * to cancel/unwind this request now.
3015 */
3016
3017 /* Unconditionally invalidate GPU caches and TLBs. */
3018 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3019 if (ret)
3020 return ret;
3021
3022 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3023 return 0;
3024 }
3025
3026 /*
3027 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3028 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3029 * but there is a slight complication as this is applied in the WA batch where the
3030 * values are only initialized once so we cannot take register value at the
3031 * beginning and reuse it further; hence we save its value to memory, upload a
3032 * constant value with bit21 set and then we restore it with the saved value.
3033 * To simplify the WA, a constant value is formed by using the default value
3034 * of this register. This shouldn't be a problem because we are only modifying
3035 * it for a short period and this batch is non-preemptible. We can of course
3036 * use additional instructions that read the actual value of the register
3037 * at that time and set our bit of interest but it makes the WA complicated.
3038 *
3039 * This WA is also required for Gen9 so extracting as a function avoids
3040 * code duplication.
3041 */
3042 static u32 *
3043 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3044 {
3045 /* NB no one else is allowed to scribble over scratch + 256! */
3046 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3047 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3048 *batch++ = intel_gt_scratch_offset(engine->gt,
3049 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3050 *batch++ = 0;
3051
3052 *batch++ = MI_LOAD_REGISTER_IMM(1);
3053 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3054 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3055
3056 batch = gen8_emit_pipe_control(batch,
3057 PIPE_CONTROL_CS_STALL |
3058 PIPE_CONTROL_DC_FLUSH_ENABLE,
3059 0);
3060
3061 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3062 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3063 *batch++ = intel_gt_scratch_offset(engine->gt,
3064 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3065 *batch++ = 0;
3066
3067 return batch;
3068 }
3069
3070 /*
3071 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3072 * initialized at the beginning and shared across all contexts but this field
3073 * helps us to have multiple batches at different offsets and select them based
3074 * on some criteria. At the moment this batch always starts at the beginning of the page
3075 * and at this point we don't have multiple wa_ctx batch buffers.
3076 *
3077 * The number of WAs applied is not known at the beginning; we use this field
3078 * to return the number of DWORDs written.
3079 *
3080 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3081 * so it adds NOOPs as padding to make it cacheline aligned.
3082 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together
3083 * make a complete batch buffer.
3084 */
3085 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3086 {
3087 /* WaDisableCtxRestoreArbitration:bdw,chv */
3088 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3089
3090 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3091 if (IS_BROADWELL(engine->i915))
3092 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3093
3094 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3095 /* Actual scratch location is at 128 bytes offset */
3096 batch = gen8_emit_pipe_control(batch,
3097 PIPE_CONTROL_FLUSH_L3 |
3098 PIPE_CONTROL_STORE_DATA_INDEX |
3099 PIPE_CONTROL_CS_STALL |
3100 PIPE_CONTROL_QW_WRITE,
3101 LRC_PPHWSP_SCRATCH_ADDR);
3102
3103 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3104
3105 /* Pad to end of cacheline */
3106 while ((unsigned long)batch % CACHELINE_BYTES)
3107 *batch++ = MI_NOOP;
3108
3109 /*
3110 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3111 * execution depends on the length specified in terms of cache lines
3112 * in the register CTX_RCS_INDIRECT_CTX
3113 */
3114
3115 return batch;
3116 }
3117
3118 struct lri {
3119 i915_reg_t reg;
3120 u32 value;
3121 };
3122
3123 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3124 {
3125 GEM_BUG_ON(!count || count > 63);
3126
3127 *batch++ = MI_LOAD_REGISTER_IMM(count);
3128 do {
3129 *batch++ = i915_mmio_reg_offset(lri->reg);
3130 *batch++ = lri->value;
3131 } while (lri++, --count);
3132 *batch++ = MI_NOOP;
3133
3134 return batch;
3135 }
3136
3137 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3138 {
3139 static const struct lri lri[] = {
3140 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3141 {
3142 COMMON_SLICE_CHICKEN2,
3143 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3144 0),
3145 },
3146
3147 /* BSpec: 11391 */
3148 {
3149 FF_SLICE_CHICKEN,
3150
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3151 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3152 }, 3153 3154 /* BSpec: 11299 */ 3155 { 3156 _3D_CHICKEN3, 3157 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3158 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3159 } 3160 }; 3161 3162 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3163 3164 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3165 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3166 3167 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3168 batch = gen8_emit_pipe_control(batch, 3169 PIPE_CONTROL_FLUSH_L3 | 3170 PIPE_CONTROL_STORE_DATA_INDEX | 3171 PIPE_CONTROL_CS_STALL | 3172 PIPE_CONTROL_QW_WRITE, 3173 LRC_PPHWSP_SCRATCH_ADDR); 3174 3175 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3176 3177 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3178 if (HAS_POOLED_EU(engine->i915)) { 3179 /* 3180 * EU pool configuration is setup along with golden context 3181 * during context initialization. This value depends on 3182 * device type (2x6 or 3x6) and needs to be updated based 3183 * on which subslice is disabled especially for 2x6 3184 * devices, however it is safe to load default 3185 * configuration of 3x6 device instead of masking off 3186 * corresponding bits because HW ignores bits of a disabled 3187 * subslice and drops down to appropriate config. Please 3188 * see render_state_setup() in i915_gem_render_state.c for 3189 * possible configurations, to avoid duplication they are 3190 * not shown here again. 3191 */ 3192 *batch++ = GEN9_MEDIA_POOL_STATE; 3193 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3194 *batch++ = 0x00777000; 3195 *batch++ = 0; 3196 *batch++ = 0; 3197 *batch++ = 0; 3198 } 3199 3200 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3201 3202 /* Pad to end of cacheline */ 3203 while ((unsigned long)batch % CACHELINE_BYTES) 3204 *batch++ = MI_NOOP; 3205 3206 return batch; 3207 } 3208 3209 static u32 * 3210 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3211 { 3212 int i; 3213 3214 /* 3215 * WaPipeControlBefore3DStateSamplePattern: cnl 3216 * 3217 * Ensure the engine is idle prior to programming a 3218 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3219 */ 3220 batch = gen8_emit_pipe_control(batch, 3221 PIPE_CONTROL_CS_STALL, 3222 0); 3223 /* 3224 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3225 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3226 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3227 * confusing. Since gen8_emit_pipe_control() already advances the 3228 * batch by 6 dwords, we advance the other 10 here, completing a 3229 * cacheline. It's not clear if the workaround requires this padding 3230 * before other commands, or if it's just the regular padding we would 3231 * already have for the workaround bb, so leave it here for now. 
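 * (That is, 6 dwords of PIPE_CONTROL plus 10 MI_NOOPs gives 16 dwords,
 * i.e. 64 bytes or exactly one cacheline.)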
3232 */ 3233 for (i = 0; i < 10; i++) 3234 *batch++ = MI_NOOP; 3235 3236 /* Pad to end of cacheline */ 3237 while ((unsigned long)batch % CACHELINE_BYTES) 3238 *batch++ = MI_NOOP; 3239 3240 return batch; 3241 } 3242 3243 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3244 3245 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3246 { 3247 struct drm_i915_gem_object *obj; 3248 struct i915_vma *vma; 3249 int err; 3250 3251 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3252 if (IS_ERR(obj)) 3253 return PTR_ERR(obj); 3254 3255 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3256 if (IS_ERR(vma)) { 3257 err = PTR_ERR(vma); 3258 goto err; 3259 } 3260 3261 err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); 3262 if (err) 3263 goto err; 3264 3265 engine->wa_ctx.vma = vma; 3266 return 0; 3267 3268 err: 3269 i915_gem_object_put(obj); 3270 return err; 3271 } 3272 3273 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3274 { 3275 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3276 } 3277 3278 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3279 3280 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3281 { 3282 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3283 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3284 &wa_ctx->per_ctx }; 3285 wa_bb_func_t wa_bb_fn[2]; 3286 struct page *page; 3287 void *batch, *batch_ptr; 3288 unsigned int i; 3289 int ret; 3290 3291 if (engine->class != RENDER_CLASS) 3292 return 0; 3293 3294 switch (INTEL_GEN(engine->i915)) { 3295 case 12: 3296 case 11: 3297 return 0; 3298 case 10: 3299 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3300 wa_bb_fn[1] = NULL; 3301 break; 3302 case 9: 3303 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3304 wa_bb_fn[1] = NULL; 3305 break; 3306 case 8: 3307 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3308 wa_bb_fn[1] = NULL; 3309 break; 3310 default: 3311 MISSING_CASE(INTEL_GEN(engine->i915)); 3312 return 0; 3313 } 3314 3315 ret = lrc_setup_wa_ctx(engine); 3316 if (ret) { 3317 DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); 3318 return ret; 3319 } 3320 3321 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3322 batch = batch_ptr = kmap_atomic(page); 3323 3324 /* 3325 * Emit the two workaround batch buffers, recording the offset from the 3326 * start of the workaround batch buffer object for each and their 3327 * respective sizes. 
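 * Both offsets must remain cacheline-aligned, as the indirect context size
 * is programmed in units of cachelines (see the CTX_RCS_INDIRECT_CTX note
 * above), which is why the offset alignment is checked below.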
3328 */ 3329 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3330 wa_bb[i]->offset = batch_ptr - batch; 3331 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3332 CACHELINE_BYTES))) { 3333 ret = -EINVAL; 3334 break; 3335 } 3336 if (wa_bb_fn[i]) 3337 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3338 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3339 } 3340 3341 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3342 3343 kunmap_atomic(batch); 3344 if (ret) 3345 lrc_destroy_wa_ctx(engine); 3346 3347 return ret; 3348 } 3349 3350 static void enable_execlists(struct intel_engine_cs *engine) 3351 { 3352 u32 mode; 3353 3354 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 3355 3356 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 3357 3358 if (INTEL_GEN(engine->i915) >= 11) 3359 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 3360 else 3361 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 3362 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 3363 3364 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 3365 3366 ENGINE_WRITE_FW(engine, 3367 RING_HWS_PGA, 3368 i915_ggtt_offset(engine->status_page.vma)); 3369 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 3370 3371 engine->context_tag = 0; 3372 } 3373 3374 static bool unexpected_starting_state(struct intel_engine_cs *engine) 3375 { 3376 bool unexpected = false; 3377 3378 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 3379 DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); 3380 unexpected = true; 3381 } 3382 3383 return unexpected; 3384 } 3385 3386 static int execlists_resume(struct intel_engine_cs *engine) 3387 { 3388 intel_engine_apply_workarounds(engine); 3389 intel_engine_apply_whitelist(engine); 3390 3391 intel_mocs_init_engine(engine); 3392 3393 intel_engine_reset_breadcrumbs(engine); 3394 3395 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 3396 struct drm_printer p = drm_debug_printer(__func__); 3397 3398 intel_engine_dump(engine, &p, NULL); 3399 } 3400 3401 enable_execlists(engine); 3402 3403 return 0; 3404 } 3405 3406 static void execlists_reset_prepare(struct intel_engine_cs *engine) 3407 { 3408 struct intel_engine_execlists * const execlists = &engine->execlists; 3409 unsigned long flags; 3410 3411 ENGINE_TRACE(engine, "depth<-%d\n", 3412 atomic_read(&execlists->tasklet.count)); 3413 3414 /* 3415 * Prevent request submission to the hardware until we have 3416 * completed the reset in i915_gem_reset_finish(). If a request 3417 * is completed by one engine, it may then queue a request 3418 * to a second via its execlists->tasklet *just* as we are 3419 * calling engine->resume() and also writing the ELSP. 3420 * Turning off the execlists->tasklet until the reset is over 3421 * prevents the race. 3422 */ 3423 __tasklet_disable_sync_once(&execlists->tasklet); 3424 GEM_BUG_ON(!reset_in_progress(execlists)); 3425 3426 /* And flush any current direct submission. */ 3427 spin_lock_irqsave(&engine->active.lock, flags); 3428 spin_unlock_irqrestore(&engine->active.lock, flags); 3429 3430 /* 3431 * We stop engines, otherwise we might get failed reset and a 3432 * dead gpu (on elk). Also as modern gpu as kbl can suffer 3433 * from system hang if batchbuffer is progressing when 3434 * the reset is issued, regardless of READY_TO_RESET ack. 3435 * Thus assume it is best to stop engines on all gens 3436 * where we have a gpu reset. 
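 * (intel_engine_stop_cs() sets the STOP_RING bit in RING_MI_MODE and waits
 * briefly for the CS to report idle; __reset_stop_ring() clears that bit
 * again in the saved context image before we resubmit.)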
3437 * 3438 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 3439 * 3440 * FIXME: Wa for more modern gens needs to be validated 3441 */ 3442 intel_engine_stop_cs(engine); 3443 } 3444 3445 static void reset_csb_pointers(struct intel_engine_cs *engine) 3446 { 3447 struct intel_engine_execlists * const execlists = &engine->execlists; 3448 const unsigned int reset_value = execlists->csb_size - 1; 3449 3450 ring_set_paused(engine, 0); 3451 3452 /* 3453 * After a reset, the HW starts writing into CSB entry [0]. We 3454 * therefore have to set our HEAD pointer back one entry so that 3455 * the *first* entry we check is entry 0. To complicate this further, 3456 * as we don't wait for the first interrupt after reset, we have to 3457 * fake the HW write to point back to the last entry so that our 3458 * inline comparison of our cached head position against the last HW 3459 * write works even before the first interrupt. 3460 */ 3461 execlists->csb_head = reset_value; 3462 WRITE_ONCE(*execlists->csb_write, reset_value); 3463 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3464 3465 /* 3466 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3467 * Bludgeon them with a mmio update to be sure. 3468 */ 3469 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3470 reset_value << 8 | reset_value); 3471 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3472 3473 invalidate_csb_entries(&execlists->csb_status[0], 3474 &execlists->csb_status[reset_value]); 3475 } 3476 3477 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 3478 { 3479 int x; 3480 3481 x = lrc_ring_mi_mode(engine); 3482 if (x != -1) { 3483 regs[x + 1] &= ~STOP_RING; 3484 regs[x + 1] |= STOP_RING << 16; 3485 } 3486 } 3487 3488 static void __execlists_reset_reg_state(const struct intel_context *ce, 3489 const struct intel_engine_cs *engine) 3490 { 3491 u32 *regs = ce->lrc_reg_state; 3492 3493 __reset_stop_ring(regs, engine); 3494 } 3495 3496 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 3497 { 3498 struct intel_engine_execlists * const execlists = &engine->execlists; 3499 struct intel_context *ce; 3500 struct i915_request *rq; 3501 u32 head; 3502 3503 mb(); /* paranoia: read the CSB pointers from after the reset */ 3504 clflush(execlists->csb_write); 3505 mb(); 3506 3507 process_csb(engine); /* drain preemption events */ 3508 3509 /* Following the reset, we need to reload the CSB read/write pointers */ 3510 reset_csb_pointers(engine); 3511 3512 /* 3513 * Save the currently executing context, even if we completed 3514 * its request, it was still running at the time of the 3515 * reset and will have been clobbered. 3516 */ 3517 rq = execlists_active(execlists); 3518 if (!rq) 3519 goto unwind; 3520 3521 /* We still have requests in-flight; the engine should be active */ 3522 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 3523 3524 ce = rq->context; 3525 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3526 3527 if (i915_request_completed(rq)) { 3528 /* Idle context; tidy up the ring so we can restart afresh */ 3529 head = intel_ring_wrap(ce->ring, rq->tail); 3530 goto out_replay; 3531 } 3532 3533 /* Context has requests still in-flight; it should not be idle! */ 3534 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 3535 rq = active_request(ce->timeline, rq); 3536 head = intel_ring_wrap(ce->ring, rq->head); 3537 GEM_BUG_ON(head == ce->ring->tail); 3538 3539 /* 3540 * If this request hasn't started yet, e.g. 
it is waiting on a 3541 * semaphore, we need to avoid skipping the request or else we 3542 * break the signaling chain. However, if the context is corrupt 3543 * the request will not restart and we will be stuck with a wedged 3544 * device. It is quite often the case that if we issue a reset 3545 * while the GPU is loading the context image, that the context 3546 * image becomes corrupt. 3547 * 3548 * Otherwise, if we have not started yet, the request should replay 3549 * perfectly and we do not need to flag the result as being erroneous. 3550 */ 3551 if (!i915_request_started(rq)) 3552 goto out_replay; 3553 3554 /* 3555 * If the request was innocent, we leave the request in the ELSP 3556 * and will try to replay it on restarting. The context image may 3557 * have been corrupted by the reset, in which case we may have 3558 * to service a new GPU hang, but more likely we can continue on 3559 * without impact. 3560 * 3561 * If the request was guilty, we presume the context is corrupt 3562 * and have to at least restore the RING register in the context 3563 * image back to the expected values to skip over the guilty request. 3564 */ 3565 __i915_request_reset(rq, stalled); 3566 if (!stalled) 3567 goto out_replay; 3568 3569 /* 3570 * We want a simple context + ring to execute the breadcrumb update. 3571 * We cannot rely on the context being intact across the GPU hang, 3572 * so clear it and rebuild just what we need for the breadcrumb. 3573 * All pending requests for this context will be zapped, and any 3574 * future request will be after userspace has had the opportunity 3575 * to recreate its own state. 3576 */ 3577 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3578 restore_default_state(ce, engine); 3579 3580 out_replay: 3581 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 3582 head, ce->ring->tail); 3583 __execlists_reset_reg_state(ce, engine); 3584 __execlists_update_reg_state(ce, engine, head); 3585 ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 3586 3587 unwind: 3588 /* Push back any incomplete requests for replay after the reset. */ 3589 cancel_port_requests(execlists); 3590 __unwind_incomplete_requests(engine); 3591 } 3592 3593 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 3594 { 3595 unsigned long flags; 3596 3597 ENGINE_TRACE(engine, "\n"); 3598 3599 spin_lock_irqsave(&engine->active.lock, flags); 3600 3601 __execlists_reset(engine, stalled); 3602 3603 spin_unlock_irqrestore(&engine->active.lock, flags); 3604 } 3605 3606 static void nop_submission_tasklet(unsigned long data) 3607 { 3608 /* The driver is wedged; don't process any more events. */ 3609 } 3610 3611 static void execlists_reset_cancel(struct intel_engine_cs *engine) 3612 { 3613 struct intel_engine_execlists * const execlists = &engine->execlists; 3614 struct i915_request *rq, *rn; 3615 struct rb_node *rb; 3616 unsigned long flags; 3617 3618 ENGINE_TRACE(engine, "\n"); 3619 3620 /* 3621 * Before we call engine->cancel_requests(), we should have exclusive 3622 * access to the submission state. This is arranged for us by the 3623 * caller disabling the interrupt generation, the tasklet and other 3624 * threads that may then access the same state, giving us a free hand 3625 * to reset state. 
However, we still need to let lockdep be aware that 3626 * we know this state may be accessed in hardirq context, so we 3627 * disable the irq around this manipulation and we want to keep 3628 * the spinlock focused on its duties and not accidentally conflate 3629 * coverage to the submission's irq state. (Similarly, although we 3630 * shouldn't need to disable irq around the manipulation of the 3631 * submission's irq state, we also wish to remind ourselves that 3632 * it is irq state.) 3633 */ 3634 spin_lock_irqsave(&engine->active.lock, flags); 3635 3636 __execlists_reset(engine, true); 3637 3638 /* Mark all executing requests as skipped. */ 3639 list_for_each_entry(rq, &engine->active.requests, sched.link) 3640 mark_eio(rq); 3641 3642 /* Flush the queued requests to the timeline list (for retiring). */ 3643 while ((rb = rb_first_cached(&execlists->queue))) { 3644 struct i915_priolist *p = to_priolist(rb); 3645 int i; 3646 3647 priolist_for_each_request_consume(rq, rn, p, i) { 3648 mark_eio(rq); 3649 __i915_request_submit(rq); 3650 } 3651 3652 rb_erase_cached(&p->node, &execlists->queue); 3653 i915_priolist_free(p); 3654 } 3655 3656 /* On-hold requests will be flushed to timeline upon their release */ 3657 list_for_each_entry(rq, &engine->active.hold, sched.link) 3658 mark_eio(rq); 3659 3660 /* Cancel all attached virtual engines */ 3661 while ((rb = rb_first_cached(&execlists->virtual))) { 3662 struct virtual_engine *ve = 3663 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 3664 3665 rb_erase_cached(rb, &execlists->virtual); 3666 RB_CLEAR_NODE(rb); 3667 3668 spin_lock(&ve->base.active.lock); 3669 rq = fetch_and_zero(&ve->request); 3670 if (rq) { 3671 mark_eio(rq); 3672 3673 rq->engine = engine; 3674 __i915_request_submit(rq); 3675 i915_request_put(rq); 3676 3677 ve->base.execlists.queue_priority_hint = INT_MIN; 3678 } 3679 spin_unlock(&ve->base.active.lock); 3680 } 3681 3682 /* Remaining _unready_ requests will be nop'ed when submitted */ 3683 3684 execlists->queue_priority_hint = INT_MIN; 3685 execlists->queue = RB_ROOT_CACHED; 3686 3687 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 3688 execlists->tasklet.func = nop_submission_tasklet; 3689 3690 spin_unlock_irqrestore(&engine->active.lock, flags); 3691 } 3692 3693 static void execlists_reset_finish(struct intel_engine_cs *engine) 3694 { 3695 struct intel_engine_execlists * const execlists = &engine->execlists; 3696 3697 /* 3698 * After a GPU reset, we may have requests to replay. Do so now while 3699 * we still have the forcewake to be sure that the GPU is not allowed 3700 * to sleep before we restart and reload a context. 3701 */ 3702 GEM_BUG_ON(!reset_in_progress(execlists)); 3703 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 3704 execlists->tasklet.func(execlists->tasklet.data); 3705 3706 if (__tasklet_enable(&execlists->tasklet)) 3707 /* And kick in case we missed a new request submission. 
*/ 3708 tasklet_hi_schedule(&execlists->tasklet); 3709 ENGINE_TRACE(engine, "depth->%d\n", 3710 atomic_read(&execlists->tasklet.count)); 3711 } 3712 3713 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 3714 u64 offset, u32 len, 3715 const unsigned int flags) 3716 { 3717 u32 *cs; 3718 3719 cs = intel_ring_begin(rq, 4); 3720 if (IS_ERR(cs)) 3721 return PTR_ERR(cs); 3722 3723 /* 3724 * WaDisableCtxRestoreArbitration:bdw,chv 3725 * 3726 * We don't need to perform MI_ARB_ENABLE as often as we do (in 3727 * particular all the gen that do not need the w/a at all!), if we 3728 * took care to make sure that on every switch into this context 3729 * (both ordinary and for preemption) arbitration was enabled, 3730 * we would be fine. However, for gen8 there is another w/a that 3731 * requires us to not preempt inside GPGPU execution, so we keep 3732 * arbitration disabled for gen8 batches. Arbitration will be 3733 * re-enabled before we close the request 3734 * (engine->emit_fini_breadcrumb). 3735 */ 3736 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3737 3738 /* FIXME(BDW+): Address space and security selectors. */ 3739 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3740 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3741 *cs++ = lower_32_bits(offset); 3742 *cs++ = upper_32_bits(offset); 3743 3744 intel_ring_advance(rq, cs); 3745 3746 return 0; 3747 } 3748 3749 static int gen8_emit_bb_start(struct i915_request *rq, 3750 u64 offset, u32 len, 3751 const unsigned int flags) 3752 { 3753 u32 *cs; 3754 3755 cs = intel_ring_begin(rq, 6); 3756 if (IS_ERR(cs)) 3757 return PTR_ERR(cs); 3758 3759 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3760 3761 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 3762 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 3763 *cs++ = lower_32_bits(offset); 3764 *cs++ = upper_32_bits(offset); 3765 3766 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3767 *cs++ = MI_NOOP; 3768 3769 intel_ring_advance(rq, cs); 3770 3771 return 0; 3772 } 3773 3774 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 3775 { 3776 ENGINE_WRITE(engine, RING_IMR, 3777 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 3778 ENGINE_POSTING_READ(engine, RING_IMR); 3779 } 3780 3781 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 3782 { 3783 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 3784 } 3785 3786 static int gen8_emit_flush(struct i915_request *request, u32 mode) 3787 { 3788 u32 cmd, *cs; 3789 3790 cs = intel_ring_begin(request, 4); 3791 if (IS_ERR(cs)) 3792 return PTR_ERR(cs); 3793 3794 cmd = MI_FLUSH_DW + 1; 3795 3796 /* We always require a command barrier so that subsequent 3797 * commands, such as breadcrumb interrupts, are strictly ordered 3798 * wrt the contents of the write cache being flushed to memory 3799 * (and thus being coherent from the CPU).
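 *
 * For illustration, with a plain EMIT_FLUSH the four dwords built
 * below come out roughly as:
 *
 *	dw[0] = (MI_FLUSH_DW + 1) |		  extra dword for the upper address
 *		MI_FLUSH_DW_STORE_INDEX |
 *		MI_FLUSH_DW_OP_STOREDW;
 *	dw[1] = LRC_PPHWSP_SCRATCH_ADDR;	  post-sync write into the PPHWSP scratch
 *	dw[2] = 0;				  upper address
 *	dw[3] = 0;				  value (unused, we only want the barrier)
 *
 * with EMIT_INVALIDATE additionally setting MI_INVALIDATE_TLB (and
 * MI_INVALIDATE_BSD for the video decode engines).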
3800 */ 3801 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 3802 3803 if (mode & EMIT_INVALIDATE) { 3804 cmd |= MI_INVALIDATE_TLB; 3805 if (request->engine->class == VIDEO_DECODE_CLASS) 3806 cmd |= MI_INVALIDATE_BSD; 3807 } 3808 3809 *cs++ = cmd; 3810 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 3811 *cs++ = 0; /* upper addr */ 3812 *cs++ = 0; /* value */ 3813 intel_ring_advance(request, cs); 3814 3815 return 0; 3816 } 3817 3818 static int gen8_emit_flush_render(struct i915_request *request, 3819 u32 mode) 3820 { 3821 bool vf_flush_wa = false, dc_flush_wa = false; 3822 u32 *cs, flags = 0; 3823 int len; 3824 3825 flags |= PIPE_CONTROL_CS_STALL; 3826 3827 if (mode & EMIT_FLUSH) { 3828 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3829 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3830 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3831 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3832 } 3833 3834 if (mode & EMIT_INVALIDATE) { 3835 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3836 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3837 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3838 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3839 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3840 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3841 flags |= PIPE_CONTROL_QW_WRITE; 3842 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3843 3844 /* 3845 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 3846 * pipe control. 3847 */ 3848 if (IS_GEN(request->i915, 9)) 3849 vf_flush_wa = true; 3850 3851 /* WaForGAMHang:kbl */ 3852 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 3853 dc_flush_wa = true; 3854 } 3855 3856 len = 6; 3857 3858 if (vf_flush_wa) 3859 len += 6; 3860 3861 if (dc_flush_wa) 3862 len += 12; 3863 3864 cs = intel_ring_begin(request, len); 3865 if (IS_ERR(cs)) 3866 return PTR_ERR(cs); 3867 3868 if (vf_flush_wa) 3869 cs = gen8_emit_pipe_control(cs, 0, 0); 3870 3871 if (dc_flush_wa) 3872 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 3873 0); 3874 3875 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3876 3877 if (dc_flush_wa) 3878 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 3879 3880 intel_ring_advance(request, cs); 3881 3882 return 0; 3883 } 3884 3885 static int gen11_emit_flush_render(struct i915_request *request, 3886 u32 mode) 3887 { 3888 if (mode & EMIT_FLUSH) { 3889 u32 *cs; 3890 u32 flags = 0; 3891 3892 flags |= PIPE_CONTROL_CS_STALL; 3893 3894 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3895 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3896 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3897 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3898 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3899 flags |= PIPE_CONTROL_QW_WRITE; 3900 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3901 3902 cs = intel_ring_begin(request, 6); 3903 if (IS_ERR(cs)) 3904 return PTR_ERR(cs); 3905 3906 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3907 intel_ring_advance(request, cs); 3908 } 3909 3910 if (mode & EMIT_INVALIDATE) { 3911 u32 *cs; 3912 u32 flags = 0; 3913 3914 flags |= PIPE_CONTROL_CS_STALL; 3915 3916 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3917 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3918 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3919 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3920 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3921 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3922 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3923 flags |= PIPE_CONTROL_QW_WRITE; 3924 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3925 3926 cs = intel_ring_begin(request, 6); 3927 if (IS_ERR(cs)) 
3928 return PTR_ERR(cs); 3929 3930 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3931 intel_ring_advance(request, cs); 3932 } 3933 3934 return 0; 3935 } 3936 3937 static u32 preparser_disable(bool state) 3938 { 3939 return MI_ARB_CHECK | 1 << 8 | state; 3940 } 3941 3942 static int gen12_emit_flush_render(struct i915_request *request, 3943 u32 mode) 3944 { 3945 if (mode & EMIT_FLUSH) { 3946 u32 flags = 0; 3947 u32 *cs; 3948 3949 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 3950 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 3951 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 3952 /* Wa_1409600907:tgl */ 3953 flags |= PIPE_CONTROL_DEPTH_STALL; 3954 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 3955 flags |= PIPE_CONTROL_FLUSH_ENABLE; 3956 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 3957 3958 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3959 flags |= PIPE_CONTROL_QW_WRITE; 3960 3961 flags |= PIPE_CONTROL_CS_STALL; 3962 3963 cs = intel_ring_begin(request, 6); 3964 if (IS_ERR(cs)) 3965 return PTR_ERR(cs); 3966 3967 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 3968 intel_ring_advance(request, cs); 3969 } 3970 3971 if (mode & EMIT_INVALIDATE) { 3972 u32 flags = 0; 3973 u32 *cs; 3974 3975 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 3976 flags |= PIPE_CONTROL_TLB_INVALIDATE; 3977 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 3978 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 3979 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 3980 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 3981 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 3982 flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; 3983 3984 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 3985 flags |= PIPE_CONTROL_QW_WRITE; 3986 3987 flags |= PIPE_CONTROL_CS_STALL; 3988 3989 cs = intel_ring_begin(request, 8); 3990 if (IS_ERR(cs)) 3991 return PTR_ERR(cs); 3992 3993 /* 3994 * Prevent the pre-parser from skipping past the TLB 3995 * invalidate and loading a stale page for the batch 3996 * buffer / request payload. 3997 */ 3998 *cs++ = preparser_disable(true); 3999 4000 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4001 4002 *cs++ = preparser_disable(false); 4003 intel_ring_advance(request, cs); 4004 4005 /* 4006 * Wa_1604544889:tgl 4007 */ 4008 if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) { 4009 flags = 0; 4010 flags |= PIPE_CONTROL_CS_STALL; 4011 flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; 4012 4013 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4014 flags |= PIPE_CONTROL_QW_WRITE; 4015 4016 cs = intel_ring_begin(request, 6); 4017 if (IS_ERR(cs)) 4018 return PTR_ERR(cs); 4019 4020 cs = gen8_emit_pipe_control(cs, flags, 4021 LRC_PPHWSP_SCRATCH_ADDR); 4022 intel_ring_advance(request, cs); 4023 } 4024 } 4025 4026 return 0; 4027 } 4028 4029 /* 4030 * Reserve space for 2 NOOPs at the end of each request to be 4031 * used as a workaround for not being allowed to do lite 4032 * restore with HEAD==TAIL (WaIdleLiteRestore). 4033 */ 4034 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4035 { 4036 /* Ensure there's always at least one preemption point per-request. 
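 *
 * Roughly, the tail of every request then looks like:
 *
 *	... breadcrumb footer ...
 *	MI_ARB_CHECK	<- request->tail (recorded just before this helper runs)
 *	MI_NOOP
 *			<- request->wa_tail
 *
 * giving the submission path a tail value past the completed breadcrumb,
 * so a lite restore need never be attempted with HEAD == TAIL (the
 * WaIdleLiteRestore noted above).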
*/ 4037 *cs++ = MI_ARB_CHECK; 4038 *cs++ = MI_NOOP; 4039 request->wa_tail = intel_ring_offset(request, cs); 4040 4041 return cs; 4042 } 4043 4044 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4045 { 4046 *cs++ = MI_SEMAPHORE_WAIT | 4047 MI_SEMAPHORE_GLOBAL_GTT | 4048 MI_SEMAPHORE_POLL | 4049 MI_SEMAPHORE_SAD_EQ_SDD; 4050 *cs++ = 0; 4051 *cs++ = intel_hws_preempt_address(request->engine); 4052 *cs++ = 0; 4053 4054 return cs; 4055 } 4056 4057 static __always_inline u32* 4058 gen8_emit_fini_breadcrumb_footer(struct i915_request *request, 4059 u32 *cs) 4060 { 4061 *cs++ = MI_USER_INTERRUPT; 4062 4063 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4064 if (intel_engine_has_semaphores(request->engine)) 4065 cs = emit_preempt_busywait(request, cs); 4066 4067 request->tail = intel_ring_offset(request, cs); 4068 assert_ring_tail_valid(request->ring, request->tail); 4069 4070 return gen8_emit_wa_tail(request, cs); 4071 } 4072 4073 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4074 { 4075 cs = gen8_emit_ggtt_write(cs, 4076 request->fence.seqno, 4077 i915_request_active_timeline(request)->hwsp_offset, 4078 0); 4079 4080 return gen8_emit_fini_breadcrumb_footer(request, cs); 4081 } 4082 4083 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4084 { 4085 cs = gen8_emit_pipe_control(cs, 4086 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4087 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4088 PIPE_CONTROL_DC_FLUSH_ENABLE, 4089 0); 4090 4091 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4092 cs = gen8_emit_ggtt_write_rcs(cs, 4093 request->fence.seqno, 4094 i915_request_active_timeline(request)->hwsp_offset, 4095 PIPE_CONTROL_FLUSH_ENABLE | 4096 PIPE_CONTROL_CS_STALL); 4097 4098 return gen8_emit_fini_breadcrumb_footer(request, cs); 4099 } 4100 4101 static u32 * 4102 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4103 { 4104 cs = gen8_emit_ggtt_write_rcs(cs, 4105 request->fence.seqno, 4106 i915_request_active_timeline(request)->hwsp_offset, 4107 PIPE_CONTROL_CS_STALL | 4108 PIPE_CONTROL_TILE_CACHE_FLUSH | 4109 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4110 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4111 PIPE_CONTROL_DC_FLUSH_ENABLE | 4112 PIPE_CONTROL_FLUSH_ENABLE); 4113 4114 return gen8_emit_fini_breadcrumb_footer(request, cs); 4115 } 4116 4117 /* 4118 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4119 * flush and will continue pre-fetching the instructions after it before the 4120 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4121 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4122 * of the next request before the memory has been flushed, we're guaranteed that 4123 * we won't access the batch itself too early. 4124 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4125 * so, if the current request is modifying an instruction in the next request on 4126 * the same intel_context, we might pre-fetch and then execute the pre-update 4127 * instruction. To avoid this, the users of self-modifying code should either 4128 * disable the parser around the code emitting the memory writes, via a new flag 4129 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4130 * the in-kernel use-cases we've opted to use a separate context, see 4131 * reloc_gpu() as an example. 4132 * All the above applies only to the instructions themselves. 
Non-inline data 4133 * used by the instructions is not pre-fetched. 4134 */ 4135 4136 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4137 { 4138 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4139 MI_SEMAPHORE_GLOBAL_GTT | 4140 MI_SEMAPHORE_POLL | 4141 MI_SEMAPHORE_SAD_EQ_SDD; 4142 *cs++ = 0; 4143 *cs++ = intel_hws_preempt_address(request->engine); 4144 *cs++ = 0; 4145 *cs++ = 0; 4146 *cs++ = MI_NOOP; 4147 4148 return cs; 4149 } 4150 4151 static __always_inline u32* 4152 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) 4153 { 4154 *cs++ = MI_USER_INTERRUPT; 4155 4156 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4157 if (intel_engine_has_semaphores(request->engine)) 4158 cs = gen12_emit_preempt_busywait(request, cs); 4159 4160 request->tail = intel_ring_offset(request, cs); 4161 assert_ring_tail_valid(request->ring, request->tail); 4162 4163 return gen8_emit_wa_tail(request, cs); 4164 } 4165 4166 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) 4167 { 4168 cs = gen8_emit_ggtt_write(cs, 4169 request->fence.seqno, 4170 i915_request_active_timeline(request)->hwsp_offset, 4171 0); 4172 4173 return gen12_emit_fini_breadcrumb_footer(request, cs); 4174 } 4175 4176 static u32 * 4177 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4178 { 4179 cs = gen8_emit_ggtt_write_rcs(cs, 4180 request->fence.seqno, 4181 i915_request_active_timeline(request)->hwsp_offset, 4182 PIPE_CONTROL_CS_STALL | 4183 PIPE_CONTROL_TILE_CACHE_FLUSH | 4184 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4185 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4186 /* Wa_1409600907:tgl */ 4187 PIPE_CONTROL_DEPTH_STALL | 4188 PIPE_CONTROL_DC_FLUSH_ENABLE | 4189 PIPE_CONTROL_FLUSH_ENABLE | 4190 PIPE_CONTROL_HDC_PIPELINE_FLUSH); 4191 4192 return gen12_emit_fini_breadcrumb_footer(request, cs); 4193 } 4194 4195 static void execlists_park(struct intel_engine_cs *engine) 4196 { 4197 cancel_timer(&engine->execlists.timer); 4198 cancel_timer(&engine->execlists.preempt); 4199 } 4200 4201 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4202 { 4203 engine->submit_request = execlists_submit_request; 4204 engine->schedule = i915_schedule; 4205 engine->execlists.tasklet.func = execlists_submission_tasklet; 4206 4207 engine->reset.prepare = execlists_reset_prepare; 4208 engine->reset.rewind = execlists_reset_rewind; 4209 engine->reset.cancel = execlists_reset_cancel; 4210 engine->reset.finish = execlists_reset_finish; 4211 4212 engine->park = execlists_park; 4213 engine->unpark = NULL; 4214 4215 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4216 if (!intel_vgpu_active(engine->i915)) { 4217 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4218 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) 4219 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4220 } 4221 4222 if (INTEL_GEN(engine->i915) >= 12) 4223 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4224 4225 if (intel_engine_has_preemption(engine)) 4226 engine->emit_bb_start = gen8_emit_bb_start; 4227 else 4228 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4229 } 4230 4231 static void execlists_shutdown(struct intel_engine_cs *engine) 4232 { 4233 /* Synchronise with residual timers and any softirq they raise */ 4234 del_timer_sync(&engine->execlists.timer); 4235 del_timer_sync(&engine->execlists.preempt); 4236 tasklet_kill(&engine->execlists.tasklet); 4237 } 4238 4239 static void execlists_release(struct intel_engine_cs *engine) 4240 { 4241 execlists_shutdown(engine); 4242 4243 intel_engine_cleanup_common(engine); 
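	/* Lastly, release the workaround batch buffers (engine->wa_ctx). */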
4244 lrc_destroy_wa_ctx(engine); 4245 } 4246 4247 static void 4248 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 4249 { 4250 /* Default vfuncs which can be overridden by each engine. */ 4251 4252 engine->resume = execlists_resume; 4253 4254 engine->cops = &execlists_context_ops; 4255 engine->request_alloc = execlists_request_alloc; 4256 4257 engine->emit_flush = gen8_emit_flush; 4258 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 4259 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 4260 if (INTEL_GEN(engine->i915) >= 12) 4261 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 4262 4263 engine->set_default_submission = intel_execlists_set_default_submission; 4264 4265 if (INTEL_GEN(engine->i915) < 11) { 4266 engine->irq_enable = gen8_logical_ring_enable_irq; 4267 engine->irq_disable = gen8_logical_ring_disable_irq; 4268 } else { 4269 /* 4270 * TODO: On Gen11 interrupt masks need to be clear 4271 * to allow C6 entry. Keep interrupts enabled 4272 * and take the hit of generating extra interrupts 4273 * until a more refined solution exists. 4274 */ 4275 } 4276 } 4277 4278 static inline void 4279 logical_ring_default_irqs(struct intel_engine_cs *engine) 4280 { 4281 unsigned int shift = 0; 4282 4283 if (INTEL_GEN(engine->i915) < 11) { 4284 const u8 irq_shifts[] = { 4285 [RCS0] = GEN8_RCS_IRQ_SHIFT, 4286 [BCS0] = GEN8_BCS_IRQ_SHIFT, 4287 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 4288 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 4289 [VECS0] = GEN8_VECS_IRQ_SHIFT, 4290 }; 4291 4292 shift = irq_shifts[engine->id]; 4293 } 4294 4295 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 4296 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 4297 } 4298 4299 static void rcs_submission_override(struct intel_engine_cs *engine) 4300 { 4301 switch (INTEL_GEN(engine->i915)) { 4302 case 12: 4303 engine->emit_flush = gen12_emit_flush_render; 4304 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 4305 break; 4306 case 11: 4307 engine->emit_flush = gen11_emit_flush_render; 4308 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 4309 break; 4310 default: 4311 engine->emit_flush = gen8_emit_flush_render; 4312 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 4313 break; 4314 } 4315 } 4316 4317 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 4318 { 4319 struct intel_engine_execlists * const execlists = &engine->execlists; 4320 struct drm_i915_private *i915 = engine->i915; 4321 struct intel_uncore *uncore = engine->uncore; 4322 u32 base = engine->mmio_base; 4323 4324 tasklet_init(&engine->execlists.tasklet, 4325 execlists_submission_tasklet, (unsigned long)engine); 4326 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 4327 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 4328 4329 logical_ring_default_vfuncs(engine); 4330 logical_ring_default_irqs(engine); 4331 4332 if (engine->class == RENDER_CLASS) 4333 rcs_submission_override(engine); 4334 4335 if (intel_init_workaround_bb(engine)) 4336 /* 4337 * We continue even if we fail to initialize the WA batch 4338 * because we only expect rare glitches, nothing 4339 * critical enough to prevent us from using the GPU 4340 */ 4341 DRM_ERROR("WA batch buffer initialization failed\n"); 4342 4343 if (HAS_LOGICAL_RING_ELSQ(i915)) { 4344 execlists->submit_reg = uncore->regs + 4345 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 4346 execlists->ctrl_reg = uncore->regs + 4347 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 4348 } else { 4349
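		/*
		 * Without an ExecList Submission Queue, submission is a
		 * direct write of the context descriptors to the lone ELSP
		 * register mapped here; the ELSQ path above instead stages
		 * them in RING_EXECLIST_SQ_CONTENTS and triggers the load
		 * via RING_EXECLIST_CONTROL.
		 */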
execlists->submit_reg = uncore->regs + 4350 i915_mmio_reg_offset(RING_ELSP(base)); 4351 } 4352 4353 execlists->csb_status = 4354 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 4355 4356 execlists->csb_write = 4357 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 4358 4359 if (INTEL_GEN(i915) < 11) 4360 execlists->csb_size = GEN8_CSB_ENTRIES; 4361 else 4362 execlists->csb_size = GEN11_CSB_ENTRIES; 4363 4364 reset_csb_pointers(engine); 4365 4366 /* Finally, take ownership and responsibility for cleanup! */ 4367 engine->release = execlists_release; 4368 4369 return 0; 4370 } 4371 4372 static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) 4373 { 4374 u32 indirect_ctx_offset; 4375 4376 switch (INTEL_GEN(engine->i915)) { 4377 default: 4378 MISSING_CASE(INTEL_GEN(engine->i915)); 4379 /* fall through */ 4380 case 12: 4381 indirect_ctx_offset = 4382 GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4383 break; 4384 case 11: 4385 indirect_ctx_offset = 4386 GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4387 break; 4388 case 10: 4389 indirect_ctx_offset = 4390 GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4391 break; 4392 case 9: 4393 indirect_ctx_offset = 4394 GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4395 break; 4396 case 8: 4397 indirect_ctx_offset = 4398 GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 4399 break; 4400 } 4401 4402 return indirect_ctx_offset; 4403 } 4404 4405 4406 static void init_common_reg_state(u32 * const regs, 4407 const struct intel_engine_cs *engine, 4408 const struct intel_ring *ring, 4409 bool inhibit) 4410 { 4411 u32 ctl; 4412 4413 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 4414 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 4415 if (inhibit) 4416 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 4417 if (INTEL_GEN(engine->i915) < 11) 4418 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 4419 CTX_CTRL_RS_CTX_ENABLE); 4420 regs[CTX_CONTEXT_CONTROL] = ctl; 4421 4422 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 4423 } 4424 4425 static void init_wa_bb_reg_state(u32 * const regs, 4426 const struct intel_engine_cs *engine, 4427 u32 pos_bb_per_ctx) 4428 { 4429 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 4430 4431 if (wa_ctx->per_ctx.size) { 4432 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4433 4434 regs[pos_bb_per_ctx] = 4435 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 4436 } 4437 4438 if (wa_ctx->indirect_ctx.size) { 4439 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 4440 4441 regs[pos_bb_per_ctx + 2] = 4442 (ggtt_offset + wa_ctx->indirect_ctx.offset) | 4443 (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); 4444 4445 regs[pos_bb_per_ctx + 4] = 4446 intel_lr_indirect_ctx_offset(engine) << 6; 4447 } 4448 } 4449 4450 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 4451 { 4452 if (i915_vm_is_4lvl(&ppgtt->vm)) { 4453 /* 64b PPGTT (48bit canonical) 4454 * PDP0_DESCRIPTOR contains the base address to PML4 and 4455 * other PDP Descriptors are ignored. 
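 *
 * Roughly, the two layouts written below are:
 *
 *	4-level: PDP0 <- address of the PML4 (ASSIGN_CTX_PML4)
 *	3-level: PDP3..PDP0 <- the four page-directory addresses
 *		 (ASSIGN_CTX_PDP, indices 3..0)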
4456 */ 4457 ASSIGN_CTX_PML4(ppgtt, regs); 4458 } else { 4459 ASSIGN_CTX_PDP(ppgtt, regs, 3); 4460 ASSIGN_CTX_PDP(ppgtt, regs, 2); 4461 ASSIGN_CTX_PDP(ppgtt, regs, 1); 4462 ASSIGN_CTX_PDP(ppgtt, regs, 0); 4463 } 4464 } 4465 4466 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 4467 { 4468 if (i915_is_ggtt(vm)) 4469 return i915_vm_to_ggtt(vm)->alias; 4470 else 4471 return i915_vm_to_ppgtt(vm); 4472 } 4473 4474 static void execlists_init_reg_state(u32 *regs, 4475 const struct intel_context *ce, 4476 const struct intel_engine_cs *engine, 4477 const struct intel_ring *ring, 4478 bool inhibit) 4479 { 4480 /* 4481 * A context is actually a big batch buffer with several 4482 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 4483 * values we are setting here are only for the first context restore: 4484 * on a subsequent save, the GPU will recreate this batchbuffer with new 4485 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 4486 * we are not initializing here). 4487 * 4488 * Must keep consistent with virtual_update_register_offsets(). 4489 */ 4490 set_offsets(regs, reg_offsets(engine), engine, inhibit); 4491 4492 init_common_reg_state(regs, engine, ring, inhibit); 4493 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 4494 4495 init_wa_bb_reg_state(regs, engine, 4496 INTEL_GEN(engine->i915) >= 12 ? 4497 GEN12_CTX_BB_PER_CTX_PTR : 4498 CTX_BB_PER_CTX_PTR); 4499 4500 __reset_stop_ring(regs, engine); 4501 } 4502 4503 static int 4504 populate_lr_context(struct intel_context *ce, 4505 struct drm_i915_gem_object *ctx_obj, 4506 struct intel_engine_cs *engine, 4507 struct intel_ring *ring) 4508 { 4509 bool inhibit = true; 4510 void *vaddr; 4511 int ret; 4512 4513 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 4514 if (IS_ERR(vaddr)) { 4515 ret = PTR_ERR(vaddr); 4516 DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); 4517 return ret; 4518 } 4519 4520 set_redzone(vaddr, engine); 4521 4522 if (engine->default_state) { 4523 void *defaults; 4524 4525 defaults = i915_gem_object_pin_map(engine->default_state, 4526 I915_MAP_WB); 4527 if (IS_ERR(defaults)) { 4528 ret = PTR_ERR(defaults); 4529 goto err_unpin_ctx; 4530 } 4531 4532 memcpy(vaddr, defaults, engine->context_size); 4533 i915_gem_object_unpin_map(engine->default_state); 4534 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 4535 inhibit = false; 4536 } 4537 4538 /* The second page of the context object contains some fields which must 4539 * be set up prior to the first execution. 
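 *
 * As a rough sketch of the backing store populated here (assuming the
 * usual layout, with the register state at page LRC_STATE_PN):
 *
 *	page 0			per-process HWSP (PPHWSP)
 *	page LRC_STATE_PN	register state, written by
 *				execlists_init_reg_state() below
 *	...			rest of engine->context_size
 *	(debug builds)		an extra redzone page, see set_redzone()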
*/ 4540 execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, 4541 ce, engine, ring, inhibit); 4542 4543 ret = 0; 4544 err_unpin_ctx: 4545 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 4546 i915_gem_object_unpin_map(ctx_obj); 4547 return ret; 4548 } 4549 4550 static int __execlists_context_alloc(struct intel_context *ce, 4551 struct intel_engine_cs *engine) 4552 { 4553 struct drm_i915_gem_object *ctx_obj; 4554 struct intel_ring *ring; 4555 struct i915_vma *vma; 4556 u32 context_size; 4557 int ret; 4558 4559 GEM_BUG_ON(ce->state); 4560 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 4561 4562 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4563 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 4564 4565 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 4566 if (IS_ERR(ctx_obj)) 4567 return PTR_ERR(ctx_obj); 4568 4569 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 4570 if (IS_ERR(vma)) { 4571 ret = PTR_ERR(vma); 4572 goto error_deref_obj; 4573 } 4574 4575 if (!ce->timeline) { 4576 struct intel_timeline *tl; 4577 4578 tl = intel_timeline_create(engine->gt, NULL); 4579 if (IS_ERR(tl)) { 4580 ret = PTR_ERR(tl); 4581 goto error_deref_obj; 4582 } 4583 4584 ce->timeline = tl; 4585 } 4586 4587 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 4588 if (IS_ERR(ring)) { 4589 ret = PTR_ERR(ring); 4590 goto error_deref_obj; 4591 } 4592 4593 ret = populate_lr_context(ce, ctx_obj, engine, ring); 4594 if (ret) { 4595 DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); 4596 goto error_ring_free; 4597 } 4598 4599 ce->ring = ring; 4600 ce->state = vma; 4601 4602 return 0; 4603 4604 error_ring_free: 4605 intel_ring_put(ring); 4606 error_deref_obj: 4607 i915_gem_object_put(ctx_obj); 4608 return ret; 4609 } 4610 4611 static struct list_head *virtual_queue(struct virtual_engine *ve) 4612 { 4613 return &ve->base.execlists.default_priolist.requests[0]; 4614 } 4615 4616 static void virtual_context_destroy(struct kref *kref) 4617 { 4618 struct virtual_engine *ve = 4619 container_of(kref, typeof(*ve), context.ref); 4620 unsigned int n; 4621 4622 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4623 GEM_BUG_ON(ve->request); 4624 GEM_BUG_ON(ve->context.inflight); 4625 4626 for (n = 0; n < ve->num_siblings; n++) { 4627 struct intel_engine_cs *sibling = ve->siblings[n]; 4628 struct rb_node *node = &ve->nodes[sibling->id].rb; 4629 unsigned long flags; 4630 4631 if (RB_EMPTY_NODE(node)) 4632 continue; 4633 4634 spin_lock_irqsave(&sibling->active.lock, flags); 4635 4636 /* Detachment is lazily performed in the execlists tasklet */ 4637 if (!RB_EMPTY_NODE(node)) 4638 rb_erase_cached(node, &sibling->execlists.virtual); 4639 4640 spin_unlock_irqrestore(&sibling->active.lock, flags); 4641 } 4642 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 4643 4644 if (ve->context.state) 4645 __execlists_context_fini(&ve->context); 4646 intel_context_fini(&ve->context); 4647 4648 kfree(ve->bonds); 4649 kfree(ve); 4650 } 4651 4652 static void virtual_engine_initial_hint(struct virtual_engine *ve) 4653 { 4654 int swp; 4655 4656 /* 4657 * Pick a random sibling on starting to help spread the load around. 4658 * 4659 * New contexts are typically created with exactly the same order 4660 * of siblings, and often started in batches. Due to the way we iterate 4661 * the array of sibling when submitting requests, sibling[0] is 4662 * prioritised for dequeuing. 
If we make sure that sibling[0] is fairly 4663 * randomised across the system, we also help spread the load by the 4664 * first engine we inspect being different each time. 4665 * 4666 * NB This does not force us to execute on this engine, it will just 4667 * typically be the first we inspect for submission. 4668 */ 4669 swp = prandom_u32_max(ve->num_siblings); 4670 if (!swp) 4671 return; 4672 4673 swap(ve->siblings[swp], ve->siblings[0]); 4674 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 4675 virtual_update_register_offsets(ve->context.lrc_reg_state, 4676 ve->siblings[0]); 4677 } 4678 4679 static int virtual_context_alloc(struct intel_context *ce) 4680 { 4681 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4682 4683 return __execlists_context_alloc(ce, ve->siblings[0]); 4684 } 4685 4686 static int virtual_context_pin(struct intel_context *ce) 4687 { 4688 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4689 int err; 4690 4691 /* Note: we must use a real engine class for setting up reg state */ 4692 err = __execlists_context_pin(ce, ve->siblings[0]); 4693 if (err) 4694 return err; 4695 4696 virtual_engine_initial_hint(ve); 4697 return 0; 4698 } 4699 4700 static void virtual_context_enter(struct intel_context *ce) 4701 { 4702 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4703 unsigned int n; 4704 4705 for (n = 0; n < ve->num_siblings; n++) 4706 intel_engine_pm_get(ve->siblings[n]); 4707 4708 intel_timeline_enter(ce->timeline); 4709 } 4710 4711 static void virtual_context_exit(struct intel_context *ce) 4712 { 4713 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 4714 unsigned int n; 4715 4716 intel_timeline_exit(ce->timeline); 4717 4718 for (n = 0; n < ve->num_siblings; n++) 4719 intel_engine_pm_put(ve->siblings[n]); 4720 } 4721 4722 static const struct intel_context_ops virtual_context_ops = { 4723 .alloc = virtual_context_alloc, 4724 4725 .pin = virtual_context_pin, 4726 .unpin = execlists_context_unpin, 4727 4728 .enter = virtual_context_enter, 4729 .exit = virtual_context_exit, 4730 4731 .destroy = virtual_context_destroy, 4732 }; 4733 4734 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 4735 { 4736 struct i915_request *rq; 4737 intel_engine_mask_t mask; 4738 4739 rq = READ_ONCE(ve->request); 4740 if (!rq) 4741 return 0; 4742 4743 /* The rq is ready for submission; rq->execution_mask is now stable. 
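 *
 * For example, a virtual engine spanning two video engines normally
 * sees the union of its siblings' engine masks here; a bonded
 * submission may have narrowed it (see virtual_bond_execute() below),
 * and an empty mask is treated as an error and routed to siblings[0].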
*/ 4744 mask = rq->execution_mask; 4745 if (unlikely(!mask)) { 4746 /* Invalid selection, submit to a random engine in error */ 4747 i915_request_skip(rq, -ENODEV); 4748 mask = ve->siblings[0]->mask; 4749 } 4750 4751 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 4752 rq->fence.context, rq->fence.seqno, 4753 mask, ve->base.execlists.queue_priority_hint); 4754 4755 return mask; 4756 } 4757 4758 static void virtual_submission_tasklet(unsigned long data) 4759 { 4760 struct virtual_engine * const ve = (struct virtual_engine *)data; 4761 const int prio = ve->base.execlists.queue_priority_hint; 4762 intel_engine_mask_t mask; 4763 unsigned int n; 4764 4765 rcu_read_lock(); 4766 mask = virtual_submission_mask(ve); 4767 rcu_read_unlock(); 4768 if (unlikely(!mask)) 4769 return; 4770 4771 local_irq_disable(); 4772 for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { 4773 struct intel_engine_cs *sibling = ve->siblings[n]; 4774 struct ve_node * const node = &ve->nodes[sibling->id]; 4775 struct rb_node **parent, *rb; 4776 bool first; 4777 4778 if (unlikely(!(mask & sibling->mask))) { 4779 if (!RB_EMPTY_NODE(&node->rb)) { 4780 spin_lock(&sibling->active.lock); 4781 rb_erase_cached(&node->rb, 4782 &sibling->execlists.virtual); 4783 RB_CLEAR_NODE(&node->rb); 4784 spin_unlock(&sibling->active.lock); 4785 } 4786 continue; 4787 } 4788 4789 spin_lock(&sibling->active.lock); 4790 4791 if (!RB_EMPTY_NODE(&node->rb)) { 4792 /* 4793 * Cheat and avoid rebalancing the tree if we can 4794 * reuse this node in situ. 4795 */ 4796 first = rb_first_cached(&sibling->execlists.virtual) == 4797 &node->rb; 4798 if (prio == node->prio || (prio > node->prio && first)) 4799 goto submit_engine; 4800 4801 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 4802 } 4803 4804 rb = NULL; 4805 first = true; 4806 parent = &sibling->execlists.virtual.rb_root.rb_node; 4807 while (*parent) { 4808 struct ve_node *other; 4809 4810 rb = *parent; 4811 other = rb_entry(rb, typeof(*other), rb); 4812 if (prio > other->prio) { 4813 parent = &rb->rb_left; 4814 } else { 4815 parent = &rb->rb_right; 4816 first = false; 4817 } 4818 } 4819 4820 rb_link_node(&node->rb, rb, parent); 4821 rb_insert_color_cached(&node->rb, 4822 &sibling->execlists.virtual, 4823 first); 4824 4825 submit_engine: 4826 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 4827 node->prio = prio; 4828 if (first && prio > sibling->execlists.queue_priority_hint) { 4829 sibling->execlists.queue_priority_hint = prio; 4830 tasklet_hi_schedule(&sibling->execlists.tasklet); 4831 } 4832 4833 spin_unlock(&sibling->active.lock); 4834 } 4835 local_irq_enable(); 4836 } 4837 4838 static void virtual_submit_request(struct i915_request *rq) 4839 { 4840 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4841 struct i915_request *old; 4842 unsigned long flags; 4843 4844 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 4845 rq->fence.context, 4846 rq->fence.seqno); 4847 4848 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 4849 4850 spin_lock_irqsave(&ve->base.active.lock, flags); 4851 4852 old = ve->request; 4853 if (old) { /* background completion event from preempt-to-busy */ 4854 GEM_BUG_ON(!i915_request_completed(old)); 4855 __i915_request_submit(old); 4856 i915_request_put(old); 4857 } 4858 4859 if (i915_request_completed(rq)) { 4860 __i915_request_submit(rq); 4861 4862 ve->base.execlists.queue_priority_hint = INT_MIN; 4863 ve->request = NULL; 4864 } else { 4865 ve->base.execlists.queue_priority_hint = rq_prio(rq); 4866 ve->request = i915_request_get(rq); 
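		/*
		 * A virtual engine carries at most one pending request: it
		 * is parked on the local virtual_queue() and the tasklet
		 * kicked below offers it to each sibling's priority tree.
		 */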
4867 4868 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 4869 list_move_tail(&rq->sched.link, virtual_queue(ve)); 4870 4871 tasklet_schedule(&ve->base.execlists.tasklet); 4872 } 4873 4874 spin_unlock_irqrestore(&ve->base.active.lock, flags); 4875 } 4876 4877 static struct ve_bond * 4878 virtual_find_bond(struct virtual_engine *ve, 4879 const struct intel_engine_cs *master) 4880 { 4881 int i; 4882 4883 for (i = 0; i < ve->num_bonds; i++) { 4884 if (ve->bonds[i].master == master) 4885 return &ve->bonds[i]; 4886 } 4887 4888 return NULL; 4889 } 4890 4891 static void 4892 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 4893 { 4894 struct virtual_engine *ve = to_virtual_engine(rq->engine); 4895 intel_engine_mask_t allowed, exec; 4896 struct ve_bond *bond; 4897 4898 allowed = ~to_request(signal)->engine->mask; 4899 4900 bond = virtual_find_bond(ve, to_request(signal)->engine); 4901 if (bond) 4902 allowed &= bond->sibling_mask; 4903 4904 /* Restrict the bonded request to run on only the available engines */ 4905 exec = READ_ONCE(rq->execution_mask); 4906 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 4907 ; 4908 4909 /* Prevent the master from being re-run on the bonded engines */ 4910 to_request(signal)->execution_mask &= ~allowed; 4911 } 4912 4913 struct intel_context * 4914 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 4915 unsigned int count) 4916 { 4917 struct virtual_engine *ve; 4918 unsigned int n; 4919 int err; 4920 4921 if (count == 0) 4922 return ERR_PTR(-EINVAL); 4923 4924 if (count == 1) 4925 return intel_context_create(siblings[0]); 4926 4927 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 4928 if (!ve) 4929 return ERR_PTR(-ENOMEM); 4930 4931 ve->base.i915 = siblings[0]->i915; 4932 ve->base.gt = siblings[0]->gt; 4933 ve->base.uncore = siblings[0]->uncore; 4934 ve->base.id = -1; 4935 4936 ve->base.class = OTHER_CLASS; 4937 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 4938 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4939 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 4940 4941 /* 4942 * The decision on whether to submit a request using semaphores 4943 * depends on the saturated state of the engine. We only compute 4944 * this during HW submission of the request, and we need for this 4945 * state to be globally applied to all requests being submitted 4946 * to this engine. Virtual engines encompass more than one physical 4947 * engine and so we cannot accurately tell in advance if one of those 4948 * engines is already saturated and so cannot afford to use a semaphore 4949 * and be pessimized in priority for doing so -- if we are the only 4950 * context using semaphores after all other clients have stopped, we 4951 * will be starved on the saturated system. Such a global switch for 4952 * semaphores is less than ideal, but alas is the current compromise. 
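 *
 * Concretely, that global switch is the ALL_ENGINES marking below:
 * the virtual engine is always treated as saturated, so request
 * construction never opts into semaphore busy-waits for it.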
4953 */ 4954 ve->base.saturated = ALL_ENGINES; 4955 4956 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 4957 4958 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 4959 intel_engine_init_breadcrumbs(&ve->base); 4960 intel_engine_init_execlists(&ve->base); 4961 4962 ve->base.cops = &virtual_context_ops; 4963 ve->base.request_alloc = execlists_request_alloc; 4964 4965 ve->base.schedule = i915_schedule; 4966 ve->base.submit_request = virtual_submit_request; 4967 ve->base.bond_execute = virtual_bond_execute; 4968 4969 INIT_LIST_HEAD(virtual_queue(ve)); 4970 ve->base.execlists.queue_priority_hint = INT_MIN; 4971 tasklet_init(&ve->base.execlists.tasklet, 4972 virtual_submission_tasklet, 4973 (unsigned long)ve); 4974 4975 intel_context_init(&ve->context, &ve->base); 4976 4977 for (n = 0; n < count; n++) { 4978 struct intel_engine_cs *sibling = siblings[n]; 4979 4980 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 4981 if (sibling->mask & ve->base.mask) { 4982 DRM_DEBUG("duplicate %s entry in load balancer\n", 4983 sibling->name); 4984 err = -EINVAL; 4985 goto err_put; 4986 } 4987 4988 /* 4989 * The virtual engine implementation is tightly coupled to 4990 * the execlists backend -- we push requests directly 4991 * into a tree inside each physical engine. We could support 4992 * layering if we handle cloning of the requests and 4993 * submitting a copy into each backend. 4994 */ 4995 if (sibling->execlists.tasklet.func != 4996 execlists_submission_tasklet) { 4997 err = -ENODEV; 4998 goto err_put; 4999 } 5000 5001 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 5002 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 5003 5004 ve->siblings[ve->num_siblings++] = sibling; 5005 ve->base.mask |= sibling->mask; 5006 5007 /* 5008 * All physical engines must be compatible for their emission 5009 * functions (as we build the instructions during request 5010 * construction and do not alter them before submission 5011 * on the physical engine). We use the engine class as a guide 5012 * here, although that could be refined.
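 *
 * For example, balancing across two video engines of the same class
 * is accepted, whereas mixing a render engine with a video engine
 * is rejected below with -EINVAL.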
5013 */ 5014 if (ve->base.class != OTHER_CLASS) { 5015 if (ve->base.class != sibling->class) { 5016 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5017 sibling->class, ve->base.class); 5018 err = -EINVAL; 5019 goto err_put; 5020 } 5021 continue; 5022 } 5023 5024 ve->base.class = sibling->class; 5025 ve->base.uabi_class = sibling->uabi_class; 5026 snprintf(ve->base.name, sizeof(ve->base.name), 5027 "v%dx%d", ve->base.class, count); 5028 ve->base.context_size = sibling->context_size; 5029 5030 ve->base.emit_bb_start = sibling->emit_bb_start; 5031 ve->base.emit_flush = sibling->emit_flush; 5032 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5033 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5034 ve->base.emit_fini_breadcrumb_dw = 5035 sibling->emit_fini_breadcrumb_dw; 5036 5037 ve->base.flags = sibling->flags; 5038 } 5039 5040 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5041 5042 return &ve->context; 5043 5044 err_put: 5045 intel_context_put(&ve->context); 5046 return ERR_PTR(err); 5047 } 5048 5049 struct intel_context * 5050 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5051 { 5052 struct virtual_engine *se = to_virtual_engine(src); 5053 struct intel_context *dst; 5054 5055 dst = intel_execlists_create_virtual(se->siblings, 5056 se->num_siblings); 5057 if (IS_ERR(dst)) 5058 return dst; 5059 5060 if (se->num_bonds) { 5061 struct virtual_engine *de = to_virtual_engine(dst->engine); 5062 5063 de->bonds = kmemdup(se->bonds, 5064 sizeof(*se->bonds) * se->num_bonds, 5065 GFP_KERNEL); 5066 if (!de->bonds) { 5067 intel_context_put(dst); 5068 return ERR_PTR(-ENOMEM); 5069 } 5070 5071 de->num_bonds = se->num_bonds; 5072 } 5073 5074 return dst; 5075 } 5076 5077 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5078 const struct intel_engine_cs *master, 5079 const struct intel_engine_cs *sibling) 5080 { 5081 struct virtual_engine *ve = to_virtual_engine(engine); 5082 struct ve_bond *bond; 5083 int n; 5084 5085 /* Sanity check the sibling is part of the virtual engine */ 5086 for (n = 0; n < ve->num_siblings; n++) 5087 if (sibling == ve->siblings[n]) 5088 break; 5089 if (n == ve->num_siblings) 5090 return -EINVAL; 5091 5092 bond = virtual_find_bond(ve, master); 5093 if (bond) { 5094 bond->sibling_mask |= sibling->mask; 5095 return 0; 5096 } 5097 5098 bond = krealloc(ve->bonds, 5099 sizeof(*bond) * (ve->num_bonds + 1), 5100 GFP_KERNEL); 5101 if (!bond) 5102 return -ENOMEM; 5103 5104 bond[ve->num_bonds].master = master; 5105 bond[ve->num_bonds].sibling_mask = sibling->mask; 5106 5107 ve->bonds = bond; 5108 ve->num_bonds++; 5109 5110 return 0; 5111 } 5112 5113 struct intel_engine_cs * 5114 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine, 5115 unsigned int sibling) 5116 { 5117 struct virtual_engine *ve = to_virtual_engine(engine); 5118 5119 if (sibling >= ve->num_siblings) 5120 return NULL; 5121 5122 return ve->siblings[sibling]; 5123 } 5124 5125 void intel_execlists_show_requests(struct intel_engine_cs *engine, 5126 struct drm_printer *m, 5127 void (*show_request)(struct drm_printer *m, 5128 struct i915_request *rq, 5129 const char *prefix), 5130 unsigned int max) 5131 { 5132 const struct intel_engine_execlists *execlists = &engine->execlists; 5133 struct i915_request *rq, *last; 5134 unsigned long flags; 5135 unsigned int count; 5136 struct rb_node *rb; 5137 5138 spin_lock_irqsave(&engine->active.lock, flags); 5139 5140 last = NULL; 5141 count = 0; 5142 list_for_each_entry(rq, &engine->active.requests, 
sched.link) { 5143 if (count++ < max - 1) 5144 show_request(m, rq, "\t\tE "); 5145 else 5146 last = rq; 5147 } 5148 if (last) { 5149 if (count > max) { 5150 drm_printf(m, 5151 "\t\t...skipping %d executing requests...\n", 5152 count - max); 5153 } 5154 show_request(m, last, "\t\tE "); 5155 } 5156 5157 last = NULL; 5158 count = 0; 5159 if (execlists->queue_priority_hint != INT_MIN) 5160 drm_printf(m, "\t\tQueue priority hint: %d\n", 5161 execlists->queue_priority_hint); 5162 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 5163 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 5164 int i; 5165 5166 priolist_for_each_request(rq, p, i) { 5167 if (count++ < max - 1) 5168 show_request(m, rq, "\t\tQ "); 5169 else 5170 last = rq; 5171 } 5172 } 5173 if (last) { 5174 if (count > max) { 5175 drm_printf(m, 5176 "\t\t...skipping %d queued requests...\n", 5177 count - max); 5178 } 5179 show_request(m, last, "\t\tQ "); 5180 } 5181 5182 last = NULL; 5183 count = 0; 5184 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 5185 struct virtual_engine *ve = 5186 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 5187 struct i915_request *rq = READ_ONCE(ve->request); 5188 5189 if (rq) { 5190 if (count++ < max - 1) 5191 show_request(m, rq, "\t\tV "); 5192 else 5193 last = rq; 5194 } 5195 } 5196 if (last) { 5197 if (count > max) { 5198 drm_printf(m, 5199 "\t\t...skipping %d virtual requests...\n", 5200 count - max); 5201 } 5202 show_request(m, last, "\t\tV "); 5203 } 5204 5205 spin_unlock_irqrestore(&engine->active.lock, flags); 5206 } 5207 5208 void intel_lr_context_reset(struct intel_engine_cs *engine, 5209 struct intel_context *ce, 5210 u32 head, 5211 bool scrub) 5212 { 5213 GEM_BUG_ON(!intel_context_is_pinned(ce)); 5214 5215 /* 5216 * We want a simple context + ring to execute the breadcrumb update. 5217 * We cannot rely on the context being intact across the GPU hang, 5218 * so clear it and rebuild just what we need for the breadcrumb. 5219 * All pending requests for this context will be zapped, and any 5220 * future request will be after userspace has had the opportunity 5221 * to recreate its own state. 5222 */ 5223 if (scrub) 5224 restore_default_state(ce, engine); 5225 5226 /* Rerun the request; its payload has been neutered (if guilty). */ 5227 __execlists_update_reg_state(ce, engine, head); 5228 } 5229 5230 bool 5231 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 5232 { 5233 return engine->set_default_submission == 5234 intel_execlists_set_default_submission; 5235 } 5236 5237 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 5238 #include "selftest_lrc.c" 5239 #endif 5240