/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
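 *
 * As an illustrative sketch only (these are not the driver's real
 * structures), the per-context, per-engine objects described above can be
 * pictured as::
 *
 *	struct example_context {
 *		struct {
 *			struct intel_ring *ring;		// per-engine ringbuffer
 *			struct drm_i915_gem_object *state;	// per-engine backing object
 *		} engine[NUM_ENGINES];	// NUM_ENGINES is illustrative only
 *	};
 *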
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use them. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
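 *
 * To illustrate the pairing rule above, the selection of the two ELSP
 * entries behaves roughly like the following pseudo-code (illustrative
 * only; the helper names are made up and this is not the dequeue code
 * found later in this file)::
 *
 *	first = head(queue);
 *	second = request_after(first);
 *	while (second && same_context(first, second)) {
 *		first = second;			// later request subsumes the earlier one
 *		second = request_after(second);
 *	}
 *	submit_elsp(first, second);		// second may be NULL
 *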
 * That request (for the still-executing second context) will then be
 * resubmitted along with a new request for a different context, which will
 * cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed onto a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/*
	 * And finally, which physical engines this virtual engine maps onto.
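	 * The siblings[] flexible array below holds num_siblings entries,
	 * both fixed when the virtual engine is created.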
	 */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

static int __execlists_context_alloc(struct intel_context *ce,
				     struct intel_engine_cs *engine);

static void execlists_init_reg_state(u32 *reg_state,
				     const struct intel_context *ce,
				     const struct intel_engine_cs *engine,
				     const struct intel_ring *ring,
				     bool close);
static void
__execlists_update_reg_state(const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     u32 head);

static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x60;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x54;
	else if (engine->class == RENDER_CLASS)
		return 0x58;
	else
		return -1;
}

static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x74;
	else if (INTEL_GEN(engine->i915) >= 9)
		return 0x68;
	else if (engine->class == RENDER_CLASS)
		return 0xd8;
	else
		return -1;
}

static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
{
	if (INTEL_GEN(engine->i915) >= 12)
		return 0x12;
	else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
		return 0x18;
	else
		return -1;
}

static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_wa_bb_per_ctx(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	int x;

	x = lrc_ring_indirect_ptr(engine);
	if (x < 0)
		return x;

	return x + 2;
}

static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
{
	if (engine->class != RENDER_CLASS)
		return -1;

	if (INTEL_GEN(engine->i915) >= 12)
		return 0xb6;
	else if (INTEL_GEN(engine->i915) >= 11)
		return 0xaa;
	else
		return -1;
}

static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	default:
		MISSING_CASE(INTEL_GEN(engine->i915));
		fallthrough;
	case 12:
		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 11:
		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 10:
		return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 9:
		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	case 8:
		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
	}
}

static void
lrc_ring_setup_indirect_ctx(u32 *regs,
			    const struct intel_engine_cs *engine,
			    u32 ctx_bb_ggtt_addr,
			    u32 size)
{
	GEM_BUG_ON(!size);
	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
	regs[lrc_ring_indirect_ptr(engine) + 1] =
		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);

	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
	regs[lrc_ring_indirect_offset(engine) + 1] =
		lrc_ring_indirect_offset_default(engine) << 6;
}

static u32 intel_context_get_runtime(const struct intel_context *ce)
{
	/*
	 * We can use either ppHWSP[16] which is recorded before the context
	 * switch (and so excludes the cost of
	 * context switches) or use the value from the context image itself,
	 * which is saved/restored earlier and so includes the cost of the save.
	 */
	return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}

static void mark_eio(struct i915_request *rq)
{
	if (i915_request_completed(rq))
		return;

	GEM_BUG_ON(i915_request_signaled(rq));

	i915_request_set_error_once(rq, -EIO);
	i915_request_mark_complete(rq);
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	struct i915_request *active = rq;

	rcu_read_lock();
	list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
		if (i915_request_completed(rq))
			break;

		active = rq;
	}
	rcu_read_unlock();

	return active;
}

static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
{
	return (i915_ggtt_offset(engine->status_page.vma) +
		I915_GEM_HWS_PREEMPT_ADDR);
}

static inline void
ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static inline struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static inline int rq_prio(const struct i915_request *rq)
{
	return READ_ONCE(rq->sched.attr.priority);
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	return prio;
}

static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct i915_priolist *p;
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	/*
	 * As the priolist[] are inverted, with the highest priority in [0],
	 * we have to flip the index value to become priority.
	 */
	p = to_priolist(rb);
	return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
}

static inline bool need_preempt(const struct intel_engine_cs *engine,
				const struct i915_request *rq,
				struct rb_node *rb)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1]; it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
 *
 * This is what a descriptor looks like, from LSB to MSB::
 *
 *	bits  0-11:	flags, GEN8_CTX_* (cached in ctx->desc_template)
 *	bits 12-31:	LRCA, GTT address of (the HWSP of) this context
 *	bits 32-52:	ctx ID, a globally unique tag (highest bit used by GuC)
 *	bits 53-54:	mbz, reserved for use by hardware
 *	bits 55-63:	group ID, currently unused and set to 0
 *
 * Starting from Gen11, the upper dword of the descriptor has a new format:
 *
 *	bits 32-36:	reserved
 *	bits 37-47:	SW context ID
 *	bits 48-53:	engine instance
 *	bit  54:	mbz, reserved for use by hardware
 *	bits 55-60:	SW counter
 *	bits 61-63:	engine class
 *
 * engine info, SW context ID and SW counter need to form a unique number
 * (Context ID) per lrc.
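 *
 * As an illustrative example only (made-up values, not the driver's
 * packing code), an upper dword for SW context ID 5, engine instance 0,
 * SW counter 0 and engine class 1 would be assembled as::
 *
 *	upper_dword = (5 << (37 - 32)) |	// SW context ID
 *		      (0 << (48 - 32)) |	// engine instance
 *		      (0 << (55 - 32)) |	// SW counter
 *		      (1 << (61 - 32));		// engine class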
565 */ 566 static u32 567 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 568 { 569 u32 desc; 570 571 desc = INTEL_LEGACY_32B_CONTEXT; 572 if (i915_vm_is_4lvl(ce->vm)) 573 desc = INTEL_LEGACY_64B_CONTEXT; 574 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 575 576 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 577 if (IS_GEN(engine->i915, 8)) 578 desc |= GEN8_CTX_L3LLC_COHERENT; 579 580 return i915_ggtt_offset(ce->state) | desc; 581 } 582 583 static inline unsigned int dword_in_page(void *addr) 584 { 585 return offset_in_page(addr) / sizeof(u32); 586 } 587 588 static void set_offsets(u32 *regs, 589 const u8 *data, 590 const struct intel_engine_cs *engine, 591 bool clear) 592 #define NOP(x) (BIT(7) | (x)) 593 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 594 #define POSTED BIT(0) 595 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 596 #define REG16(x) \ 597 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 598 (((x) >> 2) & 0x7f) 599 #define END(total_state_size) 0, (total_state_size) 600 { 601 const u32 base = engine->mmio_base; 602 603 while (*data) { 604 u8 count, flags; 605 606 if (*data & BIT(7)) { /* skip */ 607 count = *data++ & ~BIT(7); 608 if (clear) 609 memset32(regs, MI_NOOP, count); 610 regs += count; 611 continue; 612 } 613 614 count = *data & 0x3f; 615 flags = *data >> 6; 616 data++; 617 618 *regs = MI_LOAD_REGISTER_IMM(count); 619 if (flags & POSTED) 620 *regs |= MI_LRI_FORCE_POSTED; 621 if (INTEL_GEN(engine->i915) >= 11) 622 *regs |= MI_LRI_LRM_CS_MMIO; 623 regs++; 624 625 GEM_BUG_ON(!count); 626 do { 627 u32 offset = 0; 628 u8 v; 629 630 do { 631 v = *data++; 632 offset <<= 7; 633 offset |= v & ~BIT(7); 634 } while (v & BIT(7)); 635 636 regs[0] = base + (offset << 2); 637 if (clear) 638 regs[1] = 0; 639 regs += 2; 640 } while (--count); 641 } 642 643 if (clear) { 644 u8 count = *++data; 645 646 /* Clear past the tail for HW access */ 647 GEM_BUG_ON(dword_in_page(regs) > count); 648 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 649 650 /* Close the batch; used mainly by live_lrc_layout() */ 651 *regs = MI_BATCH_BUFFER_END; 652 if (INTEL_GEN(engine->i915) >= 10) 653 *regs |= BIT(0); 654 } 655 } 656 657 static const u8 gen8_xcs_offsets[] = { 658 NOP(1), 659 LRI(11, 0), 660 REG16(0x244), 661 REG(0x034), 662 REG(0x030), 663 REG(0x038), 664 REG(0x03c), 665 REG(0x168), 666 REG(0x140), 667 REG(0x110), 668 REG(0x11c), 669 REG(0x114), 670 REG(0x118), 671 672 NOP(9), 673 LRI(9, 0), 674 REG16(0x3a8), 675 REG16(0x28c), 676 REG16(0x288), 677 REG16(0x284), 678 REG16(0x280), 679 REG16(0x27c), 680 REG16(0x278), 681 REG16(0x274), 682 REG16(0x270), 683 684 NOP(13), 685 LRI(2, 0), 686 REG16(0x200), 687 REG(0x028), 688 689 END(80) 690 }; 691 692 static const u8 gen9_xcs_offsets[] = { 693 NOP(1), 694 LRI(14, POSTED), 695 REG16(0x244), 696 REG(0x034), 697 REG(0x030), 698 REG(0x038), 699 REG(0x03c), 700 REG(0x168), 701 REG(0x140), 702 REG(0x110), 703 REG(0x11c), 704 REG(0x114), 705 REG(0x118), 706 REG(0x1c0), 707 REG(0x1c4), 708 REG(0x1c8), 709 710 NOP(3), 711 LRI(9, POSTED), 712 REG16(0x3a8), 713 REG16(0x28c), 714 REG16(0x288), 715 REG16(0x284), 716 REG16(0x280), 717 REG16(0x27c), 718 REG16(0x278), 719 REG16(0x274), 720 REG16(0x270), 721 722 NOP(13), 723 LRI(1, POSTED), 724 REG16(0x200), 725 726 NOP(13), 727 LRI(44, POSTED), 728 REG(0x028), 729 REG(0x09c), 730 REG(0x0c0), 731 REG(0x178), 732 REG(0x17c), 733 REG16(0x358), 734 REG(0x170), 735 REG(0x150), 736 REG(0x154), 737 REG(0x158), 738 REG16(0x41c), 739 
REG16(0x600), 740 REG16(0x604), 741 REG16(0x608), 742 REG16(0x60c), 743 REG16(0x610), 744 REG16(0x614), 745 REG16(0x618), 746 REG16(0x61c), 747 REG16(0x620), 748 REG16(0x624), 749 REG16(0x628), 750 REG16(0x62c), 751 REG16(0x630), 752 REG16(0x634), 753 REG16(0x638), 754 REG16(0x63c), 755 REG16(0x640), 756 REG16(0x644), 757 REG16(0x648), 758 REG16(0x64c), 759 REG16(0x650), 760 REG16(0x654), 761 REG16(0x658), 762 REG16(0x65c), 763 REG16(0x660), 764 REG16(0x664), 765 REG16(0x668), 766 REG16(0x66c), 767 REG16(0x670), 768 REG16(0x674), 769 REG16(0x678), 770 REG16(0x67c), 771 REG(0x068), 772 773 END(176) 774 }; 775 776 static const u8 gen12_xcs_offsets[] = { 777 NOP(1), 778 LRI(13, POSTED), 779 REG16(0x244), 780 REG(0x034), 781 REG(0x030), 782 REG(0x038), 783 REG(0x03c), 784 REG(0x168), 785 REG(0x140), 786 REG(0x110), 787 REG(0x1c0), 788 REG(0x1c4), 789 REG(0x1c8), 790 REG(0x180), 791 REG16(0x2b4), 792 793 NOP(5), 794 LRI(9, POSTED), 795 REG16(0x3a8), 796 REG16(0x28c), 797 REG16(0x288), 798 REG16(0x284), 799 REG16(0x280), 800 REG16(0x27c), 801 REG16(0x278), 802 REG16(0x274), 803 REG16(0x270), 804 805 END(80) 806 }; 807 808 static const u8 gen8_rcs_offsets[] = { 809 NOP(1), 810 LRI(14, POSTED), 811 REG16(0x244), 812 REG(0x034), 813 REG(0x030), 814 REG(0x038), 815 REG(0x03c), 816 REG(0x168), 817 REG(0x140), 818 REG(0x110), 819 REG(0x11c), 820 REG(0x114), 821 REG(0x118), 822 REG(0x1c0), 823 REG(0x1c4), 824 REG(0x1c8), 825 826 NOP(3), 827 LRI(9, POSTED), 828 REG16(0x3a8), 829 REG16(0x28c), 830 REG16(0x288), 831 REG16(0x284), 832 REG16(0x280), 833 REG16(0x27c), 834 REG16(0x278), 835 REG16(0x274), 836 REG16(0x270), 837 838 NOP(13), 839 LRI(1, 0), 840 REG(0x0c8), 841 842 END(80) 843 }; 844 845 static const u8 gen9_rcs_offsets[] = { 846 NOP(1), 847 LRI(14, POSTED), 848 REG16(0x244), 849 REG(0x34), 850 REG(0x30), 851 REG(0x38), 852 REG(0x3c), 853 REG(0x168), 854 REG(0x140), 855 REG(0x110), 856 REG(0x11c), 857 REG(0x114), 858 REG(0x118), 859 REG(0x1c0), 860 REG(0x1c4), 861 REG(0x1c8), 862 863 NOP(3), 864 LRI(9, POSTED), 865 REG16(0x3a8), 866 REG16(0x28c), 867 REG16(0x288), 868 REG16(0x284), 869 REG16(0x280), 870 REG16(0x27c), 871 REG16(0x278), 872 REG16(0x274), 873 REG16(0x270), 874 875 NOP(13), 876 LRI(1, 0), 877 REG(0xc8), 878 879 NOP(13), 880 LRI(44, POSTED), 881 REG(0x28), 882 REG(0x9c), 883 REG(0xc0), 884 REG(0x178), 885 REG(0x17c), 886 REG16(0x358), 887 REG(0x170), 888 REG(0x150), 889 REG(0x154), 890 REG(0x158), 891 REG16(0x41c), 892 REG16(0x600), 893 REG16(0x604), 894 REG16(0x608), 895 REG16(0x60c), 896 REG16(0x610), 897 REG16(0x614), 898 REG16(0x618), 899 REG16(0x61c), 900 REG16(0x620), 901 REG16(0x624), 902 REG16(0x628), 903 REG16(0x62c), 904 REG16(0x630), 905 REG16(0x634), 906 REG16(0x638), 907 REG16(0x63c), 908 REG16(0x640), 909 REG16(0x644), 910 REG16(0x648), 911 REG16(0x64c), 912 REG16(0x650), 913 REG16(0x654), 914 REG16(0x658), 915 REG16(0x65c), 916 REG16(0x660), 917 REG16(0x664), 918 REG16(0x668), 919 REG16(0x66c), 920 REG16(0x670), 921 REG16(0x674), 922 REG16(0x678), 923 REG16(0x67c), 924 REG(0x68), 925 926 END(176) 927 }; 928 929 static const u8 gen11_rcs_offsets[] = { 930 NOP(1), 931 LRI(15, POSTED), 932 REG16(0x244), 933 REG(0x034), 934 REG(0x030), 935 REG(0x038), 936 REG(0x03c), 937 REG(0x168), 938 REG(0x140), 939 REG(0x110), 940 REG(0x11c), 941 REG(0x114), 942 REG(0x118), 943 REG(0x1c0), 944 REG(0x1c4), 945 REG(0x1c8), 946 REG(0x180), 947 948 NOP(1), 949 LRI(9, POSTED), 950 REG16(0x3a8), 951 REG16(0x28c), 952 REG16(0x288), 953 REG16(0x284), 954 REG16(0x280), 955 REG16(0x27c), 956 
REG16(0x278), 957 REG16(0x274), 958 REG16(0x270), 959 960 LRI(1, POSTED), 961 REG(0x1b0), 962 963 NOP(10), 964 LRI(1, 0), 965 REG(0x0c8), 966 967 END(80) 968 }; 969 970 static const u8 gen12_rcs_offsets[] = { 971 NOP(1), 972 LRI(13, POSTED), 973 REG16(0x244), 974 REG(0x034), 975 REG(0x030), 976 REG(0x038), 977 REG(0x03c), 978 REG(0x168), 979 REG(0x140), 980 REG(0x110), 981 REG(0x1c0), 982 REG(0x1c4), 983 REG(0x1c8), 984 REG(0x180), 985 REG16(0x2b4), 986 987 NOP(5), 988 LRI(9, POSTED), 989 REG16(0x3a8), 990 REG16(0x28c), 991 REG16(0x288), 992 REG16(0x284), 993 REG16(0x280), 994 REG16(0x27c), 995 REG16(0x278), 996 REG16(0x274), 997 REG16(0x270), 998 999 LRI(3, POSTED), 1000 REG(0x1b0), 1001 REG16(0x5a8), 1002 REG16(0x5ac), 1003 1004 NOP(6), 1005 LRI(1, 0), 1006 REG(0x0c8), 1007 NOP(3 + 9 + 1), 1008 1009 LRI(51, POSTED), 1010 REG16(0x588), 1011 REG16(0x588), 1012 REG16(0x588), 1013 REG16(0x588), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG(0x028), 1017 REG(0x09c), 1018 REG(0x0c0), 1019 REG(0x178), 1020 REG(0x17c), 1021 REG16(0x358), 1022 REG(0x170), 1023 REG(0x150), 1024 REG(0x154), 1025 REG(0x158), 1026 REG16(0x41c), 1027 REG16(0x600), 1028 REG16(0x604), 1029 REG16(0x608), 1030 REG16(0x60c), 1031 REG16(0x610), 1032 REG16(0x614), 1033 REG16(0x618), 1034 REG16(0x61c), 1035 REG16(0x620), 1036 REG16(0x624), 1037 REG16(0x628), 1038 REG16(0x62c), 1039 REG16(0x630), 1040 REG16(0x634), 1041 REG16(0x638), 1042 REG16(0x63c), 1043 REG16(0x640), 1044 REG16(0x644), 1045 REG16(0x648), 1046 REG16(0x64c), 1047 REG16(0x650), 1048 REG16(0x654), 1049 REG16(0x658), 1050 REG16(0x65c), 1051 REG16(0x660), 1052 REG16(0x664), 1053 REG16(0x668), 1054 REG16(0x66c), 1055 REG16(0x670), 1056 REG16(0x674), 1057 REG16(0x678), 1058 REG16(0x67c), 1059 REG(0x068), 1060 REG(0x084), 1061 NOP(1), 1062 1063 END(192) 1064 }; 1065 1066 #undef END 1067 #undef REG16 1068 #undef REG 1069 #undef LRI 1070 #undef NOP 1071 1072 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1073 { 1074 /* 1075 * The gen12+ lists only have the registers we program in the basic 1076 * default state. We rely on the context image using relative 1077 * addressing to automatic fixup the register state between the 1078 * physical engines for virtual engine. 1079 */ 1080 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1081 !intel_engine_has_relative_mmio(engine)); 1082 1083 if (engine->class == RENDER_CLASS) { 1084 if (INTEL_GEN(engine->i915) >= 12) 1085 return gen12_rcs_offsets; 1086 else if (INTEL_GEN(engine->i915) >= 11) 1087 return gen11_rcs_offsets; 1088 else if (INTEL_GEN(engine->i915) >= 9) 1089 return gen9_rcs_offsets; 1090 else 1091 return gen8_rcs_offsets; 1092 } else { 1093 if (INTEL_GEN(engine->i915) >= 12) 1094 return gen12_xcs_offsets; 1095 else if (INTEL_GEN(engine->i915) >= 9) 1096 return gen9_xcs_offsets; 1097 else 1098 return gen8_xcs_offsets; 1099 } 1100 } 1101 1102 static struct i915_request * 1103 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1104 { 1105 struct i915_request *rq, *rn, *active = NULL; 1106 struct list_head *uninitialized_var(pl); 1107 int prio = I915_PRIORITY_INVALID; 1108 1109 lockdep_assert_held(&engine->active.lock); 1110 1111 list_for_each_entry_safe_reverse(rq, rn, 1112 &engine->active.requests, 1113 sched.link) { 1114 if (i915_request_completed(rq)) 1115 continue; /* XXX */ 1116 1117 __i915_request_unsubmit(rq); 1118 1119 /* 1120 * Push the request back into the queue for later resubmission. 1121 * If this request is not native to this physical engine (i.e. 
		 * it came from a virtual source), push it back onto the virtual
		 * engine so that it can be moved across onto another physical
		 * engine as load dictates.
		 */
		if (likely(rq->execution_mask == engine->mask)) {
			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
			if (rq_prio(rq) != prio) {
				prio = rq_prio(rq);
				pl = i915_sched_lookup_priolist(engine, prio);
			}
			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

			list_move(&rq->sched.link, pl);
			set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);

			active = rq;
		} else {
			struct intel_engine_cs *owner = rq->context->engine;

			/*
			 * Decouple the virtual breadcrumb before moving it
			 * back to the virtual engine -- we don't want the
			 * request to complete in the background and try
			 * and cancel the breadcrumb on the virtual engine
			 * (instead of the old engine where it is linked)!
			 */
			if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
				     &rq->fence.flags)) {
				spin_lock_nested(&rq->lock,
						 SINGLE_DEPTH_NESTING);
				i915_request_cancel_breadcrumb(rq);
				spin_unlock(&rq->lock);
			}
			WRITE_ONCE(rq->engine, owner);
			owner->submit_request(rq);
			active = NULL;
		}
	}

	return active;
}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	return __unwind_incomplete_requests(engine);
}

static inline void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
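	 *
	 * The status value forwarded here is INTEL_CONTEXT_SCHEDULE_IN or
	 * INTEL_CONTEXT_SCHEDULE_OUT, as raised by the schedule_in/out paths
	 * below, and is delivered to GVT-g through the engine's
	 * context_status_notifier chain.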
1179 */ 1180 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1181 return; 1182 1183 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1184 status, rq); 1185 } 1186 1187 static void intel_engine_context_in(struct intel_engine_cs *engine) 1188 { 1189 unsigned long flags; 1190 1191 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1192 return; 1193 1194 write_seqlock_irqsave(&engine->stats.lock, flags); 1195 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1196 engine->stats.start = ktime_get(); 1197 atomic_inc(&engine->stats.active); 1198 } 1199 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1200 } 1201 1202 static void intel_engine_context_out(struct intel_engine_cs *engine) 1203 { 1204 unsigned long flags; 1205 1206 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1207 1208 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1209 return; 1210 1211 write_seqlock_irqsave(&engine->stats.lock, flags); 1212 if (atomic_dec_and_test(&engine->stats.active)) { 1213 engine->stats.total = 1214 ktime_add(engine->stats.total, 1215 ktime_sub(ktime_get(), engine->stats.start)); 1216 } 1217 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1218 } 1219 1220 static void 1221 execlists_check_context(const struct intel_context *ce, 1222 const struct intel_engine_cs *engine) 1223 { 1224 const struct intel_ring *ring = ce->ring; 1225 u32 *regs = ce->lrc_reg_state; 1226 bool valid = true; 1227 int x; 1228 1229 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1230 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1231 engine->name, 1232 regs[CTX_RING_START], 1233 i915_ggtt_offset(ring->vma)); 1234 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1235 valid = false; 1236 } 1237 1238 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1239 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1240 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1241 engine->name, 1242 regs[CTX_RING_CTL], 1243 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1244 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1245 valid = false; 1246 } 1247 1248 x = lrc_ring_mi_mode(engine); 1249 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1250 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1251 engine->name, regs[x + 1]); 1252 regs[x + 1] &= ~STOP_RING; 1253 regs[x + 1] |= STOP_RING << 16; 1254 valid = false; 1255 } 1256 1257 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1258 } 1259 1260 static void restore_default_state(struct intel_context *ce, 1261 struct intel_engine_cs *engine) 1262 { 1263 u32 *regs; 1264 1265 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1266 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1267 1268 ce->runtime.last = intel_context_get_runtime(ce); 1269 } 1270 1271 static void reset_active(struct i915_request *rq, 1272 struct intel_engine_cs *engine) 1273 { 1274 struct intel_context * const ce = rq->context; 1275 u32 head; 1276 1277 /* 1278 * The executing context has been cancelled. We want to prevent 1279 * further execution along this context and propagate the error on 1280 * to anything depending on its results. 1281 * 1282 * In __i915_request_submit(), we apply the -EIO and remove the 1283 * requests' payloads for any banned requests. But first, we must 1284 * rewind the context back to the start of the incomplete request so 1285 * that we do not jump back into the middle of the batch. 
1286 * 1287 * We preserve the breadcrumbs and semaphores of the incomplete 1288 * requests so that inter-timeline dependencies (i.e other timelines) 1289 * remain correctly ordered. And we defer to __i915_request_submit() 1290 * so that all asynchronous waits are correctly handled. 1291 */ 1292 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1293 rq->fence.context, rq->fence.seqno); 1294 1295 /* On resubmission of the active request, payload will be scrubbed */ 1296 if (i915_request_completed(rq)) 1297 head = rq->tail; 1298 else 1299 head = active_request(ce->timeline, rq)->head; 1300 head = intel_ring_wrap(ce->ring, head); 1301 1302 /* Scrub the context image to prevent replaying the previous batch */ 1303 restore_default_state(ce, engine); 1304 __execlists_update_reg_state(ce, engine, head); 1305 1306 /* We've switched away, so this should be a no-op, but intent matters */ 1307 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1308 } 1309 1310 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1311 { 1312 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1313 ce->runtime.num_underflow += dt < 0; 1314 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1315 #endif 1316 } 1317 1318 static void intel_context_update_runtime(struct intel_context *ce) 1319 { 1320 u32 old; 1321 s32 dt; 1322 1323 if (intel_context_is_barrier(ce)) 1324 return; 1325 1326 old = ce->runtime.last; 1327 ce->runtime.last = intel_context_get_runtime(ce); 1328 dt = ce->runtime.last - old; 1329 1330 if (unlikely(dt <= 0)) { 1331 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1332 old, ce->runtime.last, dt); 1333 st_update_runtime_underflow(ce, dt); 1334 return; 1335 } 1336 1337 ewma_runtime_add(&ce->runtime.avg, dt); 1338 ce->runtime.total += dt; 1339 } 1340 1341 static inline struct intel_engine_cs * 1342 __execlists_schedule_in(struct i915_request *rq) 1343 { 1344 struct intel_engine_cs * const engine = rq->engine; 1345 struct intel_context * const ce = rq->context; 1346 1347 intel_context_get(ce); 1348 1349 if (unlikely(intel_context_is_banned(ce))) 1350 reset_active(rq, engine); 1351 1352 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1353 execlists_check_context(ce, engine); 1354 1355 if (ce->tag) { 1356 /* Use a fixed tag for OA and friends */ 1357 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1358 ce->lrc.ccid = ce->tag; 1359 } else { 1360 /* We don't need a strict matching tag, just different values */ 1361 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1362 1363 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1364 clear_bit(tag - 1, &engine->context_tag); 1365 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1366 1367 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1368 } 1369 1370 ce->lrc.ccid |= engine->execlists.ccid; 1371 1372 __intel_gt_pm_get(engine->gt); 1373 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1374 intel_engine_context_in(engine); 1375 1376 return engine; 1377 } 1378 1379 static inline struct i915_request * 1380 execlists_schedule_in(struct i915_request *rq, int idx) 1381 { 1382 struct intel_context * const ce = rq->context; 1383 struct intel_engine_cs *old; 1384 1385 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1386 trace_i915_request_in(rq, idx); 1387 1388 old = READ_ONCE(ce->inflight); 1389 do { 1390 if (!old) { 1391 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1392 break; 1393 } 1394 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1395 1396 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1397 return 
i915_request_get(rq); 1398 } 1399 1400 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1401 { 1402 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1403 struct i915_request *next = READ_ONCE(ve->request); 1404 1405 if (next && next->execution_mask & ~rq->execution_mask) 1406 tasklet_schedule(&ve->base.execlists.tasklet); 1407 } 1408 1409 static inline void 1410 __execlists_schedule_out(struct i915_request *rq, 1411 struct intel_engine_cs * const engine, 1412 unsigned int ccid) 1413 { 1414 struct intel_context * const ce = rq->context; 1415 1416 /* 1417 * NB process_csb() is not under the engine->active.lock and hence 1418 * schedule_out can race with schedule_in meaning that we should 1419 * refrain from doing non-trivial work here. 1420 */ 1421 1422 /* 1423 * If we have just completed this context, the engine may now be 1424 * idle and we want to re-enter powersaving. 1425 */ 1426 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1427 i915_request_completed(rq)) 1428 intel_engine_add_retire(engine, ce->timeline); 1429 1430 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1431 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1432 if (ccid < BITS_PER_LONG) { 1433 GEM_BUG_ON(ccid == 0); 1434 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1435 set_bit(ccid - 1, &engine->context_tag); 1436 } 1437 1438 intel_context_update_runtime(ce); 1439 intel_engine_context_out(engine); 1440 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1441 intel_gt_pm_put_async(engine->gt); 1442 1443 /* 1444 * If this is part of a virtual engine, its next request may 1445 * have been blocked waiting for access to the active context. 1446 * We have to kick all the siblings again in case we need to 1447 * switch (e.g. the next request is not runnable on this 1448 * engine). Hopefully, we will already have submitted the next 1449 * request before the tasklet runs and do not need to rebuild 1450 * each virtual tree and kick everyone again. 1451 */ 1452 if (ce->engine != engine) 1453 kick_siblings(rq, ce); 1454 1455 intel_context_put(ce); 1456 } 1457 1458 static inline void 1459 execlists_schedule_out(struct i915_request *rq) 1460 { 1461 struct intel_context * const ce = rq->context; 1462 struct intel_engine_cs *cur, *old; 1463 u32 ccid; 1464 1465 trace_i915_request_out(rq); 1466 1467 ccid = rq->context->lrc.ccid; 1468 old = READ_ONCE(ce->inflight); 1469 do 1470 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1471 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1472 if (!cur) 1473 __execlists_schedule_out(rq, old, ccid); 1474 1475 i915_request_put(rq); 1476 } 1477 1478 static u64 execlists_update_context(struct i915_request *rq) 1479 { 1480 struct intel_context *ce = rq->context; 1481 u64 desc = ce->lrc.desc; 1482 u32 tail, prev; 1483 1484 /* 1485 * WaIdleLiteRestore:bdw,skl 1486 * 1487 * We should never submit the context with the same RING_TAIL twice 1488 * just in case we submit an empty ring, which confuses the HW. 1489 * 1490 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1491 * the normal request to be able to always advance the RING_TAIL on 1492 * subsequent resubmissions (for lite restore). Should that fail us, 1493 * and we try and submit the same tail again, force the context 1494 * reload. 1495 * 1496 * If we need to return to a preempted context, we need to skip the 1497 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1498 * HW has a tendency to ignore us rewinding the TAIL to the end of 1499 * an earlier request. 
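	 *
	 * Roughly (illustration only), the check below amounts to: if the
	 * new RING_TAIL does not advance past the previously submitted tail
	 * (intel_ring_direction() <= 0), set CTX_DESC_FORCE_RESTORE so the
	 * HW reloads the context instead of attempting a lite restore.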
1500 */ 1501 tail = intel_ring_set_tail(rq->ring, rq->tail); 1502 prev = ce->lrc_reg_state[CTX_RING_TAIL]; 1503 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1504 desc |= CTX_DESC_FORCE_RESTORE; 1505 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1506 rq->tail = rq->wa_tail; 1507 1508 /* 1509 * Make sure the context image is complete before we submit it to HW. 1510 * 1511 * Ostensibly, writes (including the WCB) should be flushed prior to 1512 * an uncached write such as our mmio register access, the empirical 1513 * evidence (esp. on Braswell) suggests that the WC write into memory 1514 * may not be visible to the HW prior to the completion of the UC 1515 * register write and that we may begin execution from the context 1516 * before its image is complete leading to invalid PD chasing. 1517 */ 1518 wmb(); 1519 1520 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1521 return desc; 1522 } 1523 1524 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1525 { 1526 if (execlists->ctrl_reg) { 1527 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1528 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1529 } else { 1530 writel(upper_32_bits(desc), execlists->submit_reg); 1531 writel(lower_32_bits(desc), execlists->submit_reg); 1532 } 1533 } 1534 1535 static __maybe_unused char * 1536 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1537 { 1538 if (!rq) 1539 return ""; 1540 1541 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1542 prefix, 1543 rq->context->lrc.ccid, 1544 rq->fence.context, rq->fence.seqno, 1545 i915_request_completed(rq) ? "!" : 1546 i915_request_started(rq) ? "*" : 1547 "", 1548 rq_prio(rq)); 1549 1550 return buf; 1551 } 1552 1553 static __maybe_unused void 1554 trace_ports(const struct intel_engine_execlists *execlists, 1555 const char *msg, 1556 struct i915_request * const *ports) 1557 { 1558 const struct intel_engine_cs *engine = 1559 container_of(execlists, typeof(*engine), execlists); 1560 char __maybe_unused p0[40], p1[40]; 1561 1562 if (!ports[0]) 1563 return; 1564 1565 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1566 dump_port(p0, sizeof(p0), "", ports[0]), 1567 dump_port(p1, sizeof(p1), ", ", ports[1])); 1568 } 1569 1570 static inline bool 1571 reset_in_progress(const struct intel_engine_execlists *execlists) 1572 { 1573 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1574 } 1575 1576 static __maybe_unused bool 1577 assert_pending_valid(const struct intel_engine_execlists *execlists, 1578 const char *msg) 1579 { 1580 struct intel_engine_cs *engine = 1581 container_of(execlists, typeof(*engine), execlists); 1582 struct i915_request * const *port, *rq; 1583 struct intel_context *ce = NULL; 1584 bool sentinel = false; 1585 u32 ccid = -1; 1586 1587 trace_ports(execlists, msg, execlists->pending); 1588 1589 /* We may be messing around with the lists during reset, lalala */ 1590 if (reset_in_progress(execlists)) 1591 return true; 1592 1593 if (!execlists->pending[0]) { 1594 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1595 engine->name); 1596 return false; 1597 } 1598 1599 if (execlists->pending[execlists_num_ports(execlists)]) { 1600 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1601 engine->name, execlists_num_ports(execlists)); 1602 return false; 1603 } 1604 1605 for (port = execlists->pending; (rq = *port); port++) { 1606 unsigned long flags; 1607 bool ok = true; 1608 1609 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1610 
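		/* And it must already have been submitted, i.e. marked active */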
		GEM_BUG_ON(!i915_request_is_active(rq));

		if (ce == rq->context) {
			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ce = rq->context;

		if (ccid == ce->lrc.ccid) {
			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
				      engine->name,
				      ccid, ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ccid = ce->lrc.ccid;

		/*
		 * Sentinels are supposed to be lonely so they flush the
		 * current execution off the HW. Check that they are the
		 * only request in the pending submission.
		 */
		if (sentinel) {
			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		sentinel = i915_request_has_sentinel(rq);
		if (sentinel && port != execlists->pending) {
			GEM_TRACE_ERR("%s: sentinel context:%llx not in prime position[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		/* Hold tightly onto the lock to prevent concurrent retires! */
		if (!spin_trylock_irqsave(&rq->lock, flags))
			continue;

		if (i915_request_completed(rq))
			goto unlock;

		if (i915_active_is_idle(&ce->active) &&
		    !intel_context_is_barrier(ce)) {
			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

unlock:
		spin_unlock_irqrestore(&rq->lock, flags);
		if (!ok)
			return false;
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ?
			   execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		intel_context_force_single_submission(ce));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static unsigned long i915_request_flags(const struct i915_request *rq)
{
	return READ_ONCE(rq->fence.flags);
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
{
	/*
	 * All the outstanding signals on ve->siblings[0] must have
	 * been completed, just pending the interrupt handler. As those
	 * signals still refer to the old sibling (via rq->engine), we must
	 * transfer those to the old irq_worker to keep our locking
	 * consistent.
1823 */ 1824 intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context); 1825 } 1826 1827 #define for_each_waiter(p__, rq__) \ 1828 list_for_each_entry_lockless(p__, \ 1829 &(rq__)->sched.waiters_list, \ 1830 wait_link) 1831 1832 #define for_each_signaler(p__, rq__) \ 1833 list_for_each_entry_rcu(p__, \ 1834 &(rq__)->sched.signalers_list, \ 1835 signal_link) 1836 1837 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1838 { 1839 LIST_HEAD(list); 1840 1841 /* 1842 * We want to move the interrupted request to the back of 1843 * the round-robin list (i.e. its priority level), but 1844 * in doing so, we must then move all requests that were in 1845 * flight and were waiting for the interrupted request to 1846 * be run after it again. 1847 */ 1848 do { 1849 struct i915_dependency *p; 1850 1851 GEM_BUG_ON(i915_request_is_active(rq)); 1852 list_move_tail(&rq->sched.link, pl); 1853 1854 for_each_waiter(p, rq) { 1855 struct i915_request *w = 1856 container_of(p->waiter, typeof(*w), sched); 1857 1858 if (p->flags & I915_DEPENDENCY_WEAK) 1859 continue; 1860 1861 /* Leave semaphores spinning on the other engines */ 1862 if (w->engine != rq->engine) 1863 continue; 1864 1865 /* No waiter should start before its signaler */ 1866 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1867 i915_request_started(w) && 1868 !i915_request_completed(rq)); 1869 1870 GEM_BUG_ON(i915_request_is_active(w)); 1871 if (!i915_request_is_ready(w)) 1872 continue; 1873 1874 if (rq_prio(w) < rq_prio(rq)) 1875 continue; 1876 1877 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1878 list_move_tail(&w->sched.link, &list); 1879 } 1880 1881 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1882 } while (rq); 1883 } 1884 1885 static void defer_active(struct intel_engine_cs *engine) 1886 { 1887 struct i915_request *rq; 1888 1889 rq = __unwind_incomplete_requests(engine); 1890 if (!rq) 1891 return; 1892 1893 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1894 } 1895 1896 static bool 1897 need_timeslice(const struct intel_engine_cs *engine, 1898 const struct i915_request *rq) 1899 { 1900 int hint; 1901 1902 if (!intel_engine_has_timeslices(engine)) 1903 return false; 1904 1905 hint = engine->execlists.queue_priority_hint; 1906 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1907 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1908 1909 return hint >= effective_prio(rq); 1910 } 1911 1912 static bool 1913 timeslice_yield(const struct intel_engine_execlists *el, 1914 const struct i915_request *rq) 1915 { 1916 /* 1917 * Once bitten, forever smitten! 1918 * 1919 * If the active context ever busy-waited on a semaphore, 1920 * it will be treated as a hog until the end of its timeslice (i.e. 1921 * until it is scheduled out and replaced by a new submission, 1922 * possibly even its own lite-restore). The HW only sends an interrupt 1923 * on the first miss, and we do know if that semaphore has been 1924 * signaled, or even if it is now stuck on another semaphore. Play 1925 * safe, yield if it might be stuck -- it will be given a fresh 1926 * timeslice in the near future. 
1927 */ 1928 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1929 } 1930 1931 static bool 1932 timeslice_expired(const struct intel_engine_execlists *el, 1933 const struct i915_request *rq) 1934 { 1935 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1936 } 1937 1938 static int 1939 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1940 { 1941 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1942 return INT_MIN; 1943 1944 return rq_prio(list_next_entry(rq, sched.link)); 1945 } 1946 1947 static inline unsigned long 1948 timeslice(const struct intel_engine_cs *engine) 1949 { 1950 return READ_ONCE(engine->props.timeslice_duration_ms); 1951 } 1952 1953 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1954 { 1955 const struct intel_engine_execlists *execlists = &engine->execlists; 1956 const struct i915_request *rq = *execlists->active; 1957 1958 if (!rq || i915_request_completed(rq)) 1959 return 0; 1960 1961 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1962 return 0; 1963 1964 return timeslice(engine); 1965 } 1966 1967 static void set_timeslice(struct intel_engine_cs *engine) 1968 { 1969 unsigned long duration; 1970 1971 if (!intel_engine_has_timeslices(engine)) 1972 return; 1973 1974 duration = active_timeslice(engine); 1975 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 1976 1977 set_timer_ms(&engine->execlists.timer, duration); 1978 } 1979 1980 static void start_timeslice(struct intel_engine_cs *engine) 1981 { 1982 struct intel_engine_execlists *execlists = &engine->execlists; 1983 const int prio = queue_prio(execlists); 1984 unsigned long duration; 1985 1986 if (!intel_engine_has_timeslices(engine)) 1987 return; 1988 1989 WRITE_ONCE(execlists->switch_priority_hint, prio); 1990 if (prio == INT_MIN) 1991 return; 1992 1993 if (timer_pending(&execlists->timer)) 1994 return; 1995 1996 duration = timeslice(engine); 1997 ENGINE_TRACE(engine, 1998 "start timeslicing, prio:%d, interval:%lu", 1999 prio, duration); 2000 2001 set_timer_ms(&execlists->timer, duration); 2002 } 2003 2004 static void record_preemption(struct intel_engine_execlists *execlists) 2005 { 2006 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2007 } 2008 2009 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2010 const struct i915_request *rq) 2011 { 2012 if (!rq) 2013 return 0; 2014 2015 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2016 if (unlikely(intel_context_is_banned(rq->context))) 2017 return 1; 2018 2019 return READ_ONCE(engine->props.preempt_timeout_ms); 2020 } 2021 2022 static void set_preempt_timeout(struct intel_engine_cs *engine, 2023 const struct i915_request *rq) 2024 { 2025 if (!intel_engine_has_preempt_reset(engine)) 2026 return; 2027 2028 set_timer_ms(&engine->execlists.preempt, 2029 active_preempt_timeout(engine, rq)); 2030 } 2031 2032 static inline void clear_ports(struct i915_request **ports, int count) 2033 { 2034 memset_p((void **)ports, NULL, count); 2035 } 2036 2037 static void execlists_dequeue(struct intel_engine_cs *engine) 2038 { 2039 struct intel_engine_execlists * const execlists = &engine->execlists; 2040 struct i915_request **port = execlists->pending; 2041 struct i915_request ** const last_port = port + execlists->port_mask; 2042 struct i915_request * const *active; 2043 struct i915_request *last; 2044 struct rb_node *rb; 2045 bool submit = false; 2046 2047 /* 2048 * Hardware submission is through 2 ports. 
Conceptually each port 2049 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2050 * static for a context, and unique to each, so we only execute 2051 * requests belonging to a single context from each ring. RING_HEAD 2052 * is maintained by the CS in the context image, it marks the place 2053 * where it got up to last time, and through RING_TAIL we tell the CS 2054 * where we want to execute up to this time. 2055 * 2056 * In this list the requests are in order of execution. Consecutive 2057 * requests from the same context are adjacent in the ringbuffer. We 2058 * can combine these requests into a single RING_TAIL update: 2059 * 2060 * RING_HEAD...req1...req2 2061 * ^- RING_TAIL 2062 * since to execute req2 the CS must first execute req1. 2063 * 2064 * Our goal then is to point each port to the end of a consecutive 2065 * sequence of requests as being the most optimal (fewest wake ups 2066 * and context switches) submission. 2067 */ 2068 2069 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2070 struct virtual_engine *ve = 2071 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2072 struct i915_request *rq = READ_ONCE(ve->request); 2073 2074 if (!rq) { /* lazily cleanup after another engine handled rq */ 2075 rb_erase_cached(rb, &execlists->virtual); 2076 RB_CLEAR_NODE(rb); 2077 rb = rb_first_cached(&execlists->virtual); 2078 continue; 2079 } 2080 2081 if (!virtual_matches(ve, rq, engine)) { 2082 rb = rb_next(rb); 2083 continue; 2084 } 2085 2086 break; 2087 } 2088 2089 /* 2090 * If the queue is higher priority than the last 2091 * request in the currently active context, submit afresh. 2092 * We will resubmit again afterwards in case we need to split 2093 * the active context to interject the preemption request, 2094 * i.e. we will retrigger preemption following the ack in case 2095 * of trouble. 2096 */ 2097 active = READ_ONCE(execlists->active); 2098 2099 /* 2100 * In theory we can skip over completed contexts that have not 2101 * yet been processed by events (as those events are in flight): 2102 * 2103 * while ((last = *active) && i915_request_completed(last)) 2104 * active++; 2105 * 2106 * However, the GPU cannot handle this as it will ultimately 2107 * find itself trying to jump back into a context it has just 2108 * completed and barf. 2109 */ 2110 2111 if ((last = *active)) { 2112 if (need_preempt(engine, last, rb)) { 2113 if (i915_request_completed(last)) { 2114 tasklet_hi_schedule(&execlists->tasklet); 2115 return; 2116 } 2117 2118 ENGINE_TRACE(engine, 2119 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2120 last->fence.context, 2121 last->fence.seqno, 2122 last->sched.attr.priority, 2123 execlists->queue_priority_hint); 2124 record_preemption(execlists); 2125 2126 /* 2127 * Don't let the RING_HEAD advance past the breadcrumb 2128 * as we unwind (and until we resubmit) so that we do 2129 * not accidentally tell it to go backwards. 2130 */ 2131 ring_set_paused(engine, 1); 2132 2133 /* 2134 * Note that we have not stopped the GPU at this point, 2135 * so we are unwinding the incomplete requests as they 2136 * remain inflight and so by the time we do complete 2137 * the preemption, some of the unwound requests may 2138 * complete! 
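 *
 * Any such late completions are absorbed on the next dequeue:
 * can_merge_rq() pretends to merge an already-completed request into
 * the previous context, so we skip the ELSP update for it rather than
 * resubmitting work that has already finished.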
2139 */ 2140 __unwind_incomplete_requests(engine); 2141 2142 last = NULL; 2143 } else if (need_timeslice(engine, last) && 2144 timeslice_expired(execlists, last)) { 2145 if (i915_request_completed(last)) { 2146 tasklet_hi_schedule(&execlists->tasklet); 2147 return; 2148 } 2149 2150 ENGINE_TRACE(engine, 2151 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2152 last->fence.context, 2153 last->fence.seqno, 2154 last->sched.attr.priority, 2155 execlists->queue_priority_hint, 2156 yesno(timeslice_yield(execlists, last))); 2157 2158 ring_set_paused(engine, 1); 2159 defer_active(engine); 2160 2161 /* 2162 * Unlike for preemption, if we rewind and continue 2163 * executing the same context as previously active, 2164 * the order of execution will remain the same and 2165 * the tail will only advance. We do not need to 2166 * force a full context restore, as a lite-restore 2167 * is sufficient to resample the monotonic TAIL. 2168 * 2169 * If we switch to any other context, similarly we 2170 * will not rewind TAIL of current context, and 2171 * normal save/restore will preserve state and allow 2172 * us to later continue executing the same request. 2173 */ 2174 last = NULL; 2175 } else { 2176 /* 2177 * Otherwise if we already have a request pending 2178 * for execution after the current one, we can 2179 * just wait until the next CS event before 2180 * queuing more. In either case we will force a 2181 * lite-restore preemption event, but if we wait 2182 * we hopefully coalesce several updates into a single 2183 * submission. 2184 */ 2185 if (!list_is_last(&last->sched.link, 2186 &engine->active.requests)) { 2187 /* 2188 * Even if ELSP[1] is occupied and not worthy 2189 * of timeslices, our queue might be. 2190 */ 2191 start_timeslice(engine); 2192 return; 2193 } 2194 } 2195 } 2196 2197 while (rb) { /* XXX virtual is always taking precedence */ 2198 struct virtual_engine *ve = 2199 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2200 struct i915_request *rq; 2201 2202 spin_lock(&ve->base.active.lock); 2203 2204 rq = ve->request; 2205 if (unlikely(!rq)) { /* lost the race to a sibling */ 2206 spin_unlock(&ve->base.active.lock); 2207 rb_erase_cached(rb, &execlists->virtual); 2208 RB_CLEAR_NODE(rb); 2209 rb = rb_first_cached(&execlists->virtual); 2210 continue; 2211 } 2212 2213 GEM_BUG_ON(rq != ve->request); 2214 GEM_BUG_ON(rq->engine != &ve->base); 2215 GEM_BUG_ON(rq->context != &ve->context); 2216 2217 if (rq_prio(rq) >= queue_prio(execlists)) { 2218 if (!virtual_matches(ve, rq, engine)) { 2219 spin_unlock(&ve->base.active.lock); 2220 rb = rb_next(rb); 2221 continue; 2222 } 2223 2224 if (last && !can_merge_rq(last, rq)) { 2225 spin_unlock(&ve->base.active.lock); 2226 start_timeslice(engine); 2227 return; /* leave this for another sibling */ 2228 } 2229 2230 ENGINE_TRACE(engine, 2231 "virtual rq=%llx:%lld%s, new engine? %s\n", 2232 rq->fence.context, 2233 rq->fence.seqno, 2234 i915_request_completed(rq) ? "!" : 2235 i915_request_started(rq) ? 
"*" : 2236 "", 2237 yesno(engine != ve->siblings[0])); 2238 2239 WRITE_ONCE(ve->request, NULL); 2240 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2241 INT_MIN); 2242 rb_erase_cached(rb, &execlists->virtual); 2243 RB_CLEAR_NODE(rb); 2244 2245 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2246 WRITE_ONCE(rq->engine, engine); 2247 2248 if (engine != ve->siblings[0]) { 2249 u32 *regs = ve->context.lrc_reg_state; 2250 unsigned int n; 2251 2252 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 2253 2254 if (!intel_engine_has_relative_mmio(engine)) 2255 virtual_update_register_offsets(regs, 2256 engine); 2257 2258 if (!list_empty(&ve->context.signals)) 2259 virtual_xfer_breadcrumbs(ve); 2260 2261 /* 2262 * Move the bound engine to the top of the list 2263 * for future execution. We then kick this 2264 * tasklet first before checking others, so that 2265 * we preferentially reuse this set of bound 2266 * registers. 2267 */ 2268 for (n = 1; n < ve->num_siblings; n++) { 2269 if (ve->siblings[n] == engine) { 2270 swap(ve->siblings[n], 2271 ve->siblings[0]); 2272 break; 2273 } 2274 } 2275 2276 GEM_BUG_ON(ve->siblings[0] != engine); 2277 } 2278 2279 if (__i915_request_submit(rq)) { 2280 submit = true; 2281 last = rq; 2282 } 2283 i915_request_put(rq); 2284 2285 /* 2286 * Hmm, we have a bunch of virtual engine requests, 2287 * but the first one was already completed (thanks 2288 * preempt-to-busy!). Keep looking at the veng queue 2289 * until we have no more relevant requests (i.e. 2290 * the normal submit queue has higher priority). 2291 */ 2292 if (!submit) { 2293 spin_unlock(&ve->base.active.lock); 2294 rb = rb_first_cached(&execlists->virtual); 2295 continue; 2296 } 2297 } 2298 2299 spin_unlock(&ve->base.active.lock); 2300 break; 2301 } 2302 2303 while ((rb = rb_first_cached(&execlists->queue))) { 2304 struct i915_priolist *p = to_priolist(rb); 2305 struct i915_request *rq, *rn; 2306 int i; 2307 2308 priolist_for_each_request_consume(rq, rn, p, i) { 2309 bool merge = true; 2310 2311 /* 2312 * Can we combine this request with the current port? 2313 * It has to be the same context/ringbuffer and not 2314 * have any exceptions (e.g. GVT saying never to 2315 * combine contexts). 2316 * 2317 * If we can combine the requests, we can execute both 2318 * by updating the RING_TAIL to point to the end of the 2319 * second request, and so we never need to tell the 2320 * hardware about the first. 2321 */ 2322 if (last && !can_merge_rq(last, rq)) { 2323 /* 2324 * If we are on the second port and cannot 2325 * combine this request with the last, then we 2326 * are done. 2327 */ 2328 if (port == last_port) 2329 goto done; 2330 2331 /* 2332 * We must not populate both ELSP[] with the 2333 * same LRCA, i.e. we must submit 2 different 2334 * contexts if we submit 2 ELSP. 2335 */ 2336 if (last->context == rq->context) 2337 goto done; 2338 2339 if (i915_request_has_sentinel(last)) 2340 goto done; 2341 2342 /* 2343 * If GVT overrides us we only ever submit 2344 * port[0], leaving port[1] empty. Note that we 2345 * also have to be careful that we don't queue 2346 * the same context (even though a different 2347 * request) to the second port. 
2348 */ 2349 if (ctx_single_port_submission(last->context) || 2350 ctx_single_port_submission(rq->context)) 2351 goto done; 2352 2353 merge = false; 2354 } 2355 2356 if (__i915_request_submit(rq)) { 2357 if (!merge) { 2358 *port = execlists_schedule_in(last, port - execlists->pending); 2359 port++; 2360 last = NULL; 2361 } 2362 2363 GEM_BUG_ON(last && 2364 !can_merge_ctx(last->context, 2365 rq->context)); 2366 GEM_BUG_ON(last && 2367 i915_seqno_passed(last->fence.seqno, 2368 rq->fence.seqno)); 2369 2370 submit = true; 2371 last = rq; 2372 } 2373 } 2374 2375 rb_erase_cached(&p->node, &execlists->queue); 2376 i915_priolist_free(p); 2377 } 2378 2379 done: 2380 /* 2381 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2382 * 2383 * We choose the priority hint such that if we add a request of greater 2384 * priority than this, we kick the submission tasklet to decide on 2385 * the right order of submitting the requests to hardware. We must 2386 * also be prepared to reorder requests as they are in-flight on the 2387 * HW. We derive the priority hint then as the first "hole" in 2388 * the HW submission ports and if there are no available slots, 2389 * the priority of the lowest executing request, i.e. last. 2390 * 2391 * When we do receive a higher priority request ready to run from the 2392 * user, see queue_request(), the priority hint is bumped to that 2393 * request triggering preemption on the next dequeue (or subsequent 2394 * interrupt for secondary ports). 2395 */ 2396 execlists->queue_priority_hint = queue_prio(execlists); 2397 2398 if (submit) { 2399 *port = execlists_schedule_in(last, port - execlists->pending); 2400 execlists->switch_priority_hint = 2401 switch_prio(engine, *execlists->pending); 2402 2403 /* 2404 * Skip if we ended up with exactly the same set of requests, 2405 * e.g. 
trying to timeslice a pair of ordered contexts 2406 */ 2407 if (!memcmp(active, execlists->pending, 2408 (port - execlists->pending + 1) * sizeof(*port))) { 2409 do 2410 execlists_schedule_out(fetch_and_zero(port)); 2411 while (port-- != execlists->pending); 2412 2413 goto skip_submit; 2414 } 2415 clear_ports(port + 1, last_port - port); 2416 2417 WRITE_ONCE(execlists->yield, -1); 2418 set_preempt_timeout(engine, *active); 2419 execlists_submit_ports(engine); 2420 } else { 2421 skip_submit: 2422 ring_set_paused(engine, 0); 2423 } 2424 } 2425 2426 static void 2427 cancel_port_requests(struct intel_engine_execlists * const execlists) 2428 { 2429 struct i915_request * const *port; 2430 2431 for (port = execlists->pending; *port; port++) 2432 execlists_schedule_out(*port); 2433 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2434 2435 /* Mark the end of active before we overwrite *active */ 2436 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2437 execlists_schedule_out(*port); 2438 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2439 2440 smp_wmb(); /* complete the seqlock for execlists_active() */ 2441 WRITE_ONCE(execlists->active, execlists->inflight); 2442 } 2443 2444 static inline void 2445 invalidate_csb_entries(const u32 *first, const u32 *last) 2446 { 2447 clflush((void *)first); 2448 clflush((void *)last); 2449 } 2450 2451 /* 2452 * Starting with Gen12, the status has a new format: 2453 * 2454 * bit 0: switched to new queue 2455 * bit 1: reserved 2456 * bit 2: semaphore wait mode (poll or signal), only valid when 2457 * switch detail is set to "wait on semaphore" 2458 * bits 3-5: engine class 2459 * bits 6-11: engine instance 2460 * bits 12-14: reserved 2461 * bits 15-25: sw context id of the lrc the GT switched to 2462 * bits 26-31: sw counter of the lrc the GT switched to 2463 * bits 32-35: context switch detail 2464 * - 0: ctx complete 2465 * - 1: wait on sync flip 2466 * - 2: wait on vblank 2467 * - 3: wait on scanline 2468 * - 4: wait on semaphore 2469 * - 5: context preempted (not on SEMAPHORE_WAIT or 2470 * WAIT_FOR_EVENT) 2471 * bit 36: reserved 2472 * bits 37-43: wait detail (for switch detail 1 to 4) 2473 * bits 44-46: reserved 2474 * bits 47-57: sw context id of the lrc the GT switched away from 2475 * bits 58-63: sw counter of the lrc the GT switched away from 2476 */ 2477 static inline bool 2478 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2479 { 2480 u32 lower_dw = csb[0]; 2481 u32 upper_dw = csb[1]; 2482 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2483 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2484 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2485 2486 /* 2487 * The context switch detail is not guaranteed to be 5 when a preemption 2488 * occurs, so we can't just check for that. The check below works for 2489 * all the cases we care about, including preemptions of WAIT 2490 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2491 * would require some extra handling, but we don't support that. 2492 */ 2493 if (!ctx_away_valid || new_queue) { 2494 GEM_BUG_ON(!ctx_to_valid); 2495 return true; 2496 } 2497 2498 /* 2499 * switch detail = 5 is covered by the case above and we do not expect a 2500 * context switch on an unsuccessful wait instruction since we always 2501 * use polling mode. 
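 *
 * As a worked example of the layout above: an event with bit 0 set
 * ("switched to new queue"), or with no valid context id in the
 * switched-away-from fields, reports the GT picking up our pending[]
 * submission and so is treated as a promotion; an event with a valid
 * "away" id and bit 0 clear is a completion of the active context.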
2502 */ 2503 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2504 return false; 2505 } 2506 2507 static inline bool 2508 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2509 { 2510 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2511 } 2512 2513 static void process_csb(struct intel_engine_cs *engine) 2514 { 2515 struct intel_engine_execlists * const execlists = &engine->execlists; 2516 const u32 * const buf = execlists->csb_status; 2517 const u8 num_entries = execlists->csb_size; 2518 u8 head, tail; 2519 2520 /* 2521 * As we modify our execlists state tracking we require exclusive 2522 * access. Either we are inside the tasklet, or the tasklet is disabled 2523 * and we assume that is only inside the reset paths and so serialised. 2524 */ 2525 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2526 !reset_in_progress(execlists)); 2527 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2528 2529 /* 2530 * Note that csb_write, csb_status may be either in HWSP or mmio. 2531 * When reading from the csb_write mmio register, we have to be 2532 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2533 * the low 4bits. As it happens we know the next 4bits are always 2534 * zero and so we can simply masked off the low u8 of the register 2535 * and treat it identically to reading from the HWSP (without having 2536 * to use explicit shifting and masking, and probably bifurcating 2537 * the code to handle the legacy mmio read). 2538 */ 2539 head = execlists->csb_head; 2540 tail = READ_ONCE(*execlists->csb_write); 2541 if (unlikely(head == tail)) 2542 return; 2543 2544 /* 2545 * Hopefully paired with a wmb() in HW! 2546 * 2547 * We must complete the read of the write pointer before any reads 2548 * from the CSB, so that we do not see stale values. Without an rmb 2549 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2550 * we perform the READ_ONCE(*csb_write). 2551 */ 2552 rmb(); 2553 2554 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2555 do { 2556 bool promote; 2557 2558 if (++head == num_entries) 2559 head = 0; 2560 2561 /* 2562 * We are flying near dragons again. 2563 * 2564 * We hold a reference to the request in execlist_port[] 2565 * but no more than that. We are operating in softirq 2566 * context and so cannot hold any mutex or sleep. That 2567 * prevents us stopping the requests we are processing 2568 * in port[] from being retired simultaneously (the 2569 * breadcrumb will be complete before we see the 2570 * context-switch). As we only hold the reference to the 2571 * request, any pointer chasing underneath the request 2572 * is subject to a potential use-after-free. Thus we 2573 * store all of the bookkeeping within port[] as 2574 * required, and avoid using unguarded pointers beneath 2575 * request itself. The same applies to the atomic 2576 * status notifier. 
2577 */ 2578 2579 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2580 head, buf[2 * head + 0], buf[2 * head + 1]); 2581 2582 if (INTEL_GEN(engine->i915) >= 12) 2583 promote = gen12_csb_parse(execlists, buf + 2 * head); 2584 else 2585 promote = gen8_csb_parse(execlists, buf + 2 * head); 2586 if (promote) { 2587 struct i915_request * const *old = execlists->active; 2588 2589 ring_set_paused(engine, 0); 2590 2591 /* Point active to the new ELSP; prevent overwriting */ 2592 WRITE_ONCE(execlists->active, execlists->pending); 2593 smp_wmb(); /* notify execlists_active() */ 2594 2595 /* cancel old inflight, prepare for switch */ 2596 trace_ports(execlists, "preempted", old); 2597 while (*old) 2598 execlists_schedule_out(*old++); 2599 2600 /* switch pending to inflight */ 2601 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2602 memcpy(execlists->inflight, 2603 execlists->pending, 2604 execlists_num_ports(execlists) * 2605 sizeof(*execlists->pending)); 2606 smp_wmb(); /* complete the seqlock */ 2607 WRITE_ONCE(execlists->active, execlists->inflight); 2608 2609 WRITE_ONCE(execlists->pending[0], NULL); 2610 } else { 2611 GEM_BUG_ON(!*execlists->active); 2612 2613 /* port0 completed, advanced to port1 */ 2614 trace_ports(execlists, "completed", execlists->active); 2615 2616 /* 2617 * We rely on the hardware being strongly 2618 * ordered, that the breadcrumb write is 2619 * coherent (visible from the CPU) before the 2620 * user interrupt is processed. One might assume 2621 * that the breadcrumb write being before the 2622 * user interrupt and the CS event for the context 2623 * switch would therefore be before the CS event 2624 * itself... 2625 */ 2626 if (GEM_SHOW_DEBUG() && 2627 !i915_request_completed(*execlists->active)) { 2628 struct i915_request *rq = *execlists->active; 2629 const u32 *regs __maybe_unused = 2630 rq->context->lrc_reg_state; 2631 2632 ENGINE_TRACE(engine, 2633 "context completed before request!\n"); 2634 ENGINE_TRACE(engine, 2635 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2636 ENGINE_READ(engine, RING_START), 2637 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2638 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2639 ENGINE_READ(engine, RING_CTL), 2640 ENGINE_READ(engine, RING_MI_MODE)); 2641 ENGINE_TRACE(engine, 2642 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2643 i915_ggtt_offset(rq->ring->vma), 2644 rq->head, rq->tail, 2645 rq->fence.context, 2646 lower_32_bits(rq->fence.seqno), 2647 hwsp_seqno(rq)); 2648 ENGINE_TRACE(engine, 2649 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2650 regs[CTX_RING_START], 2651 regs[CTX_RING_HEAD], 2652 regs[CTX_RING_TAIL]); 2653 } 2654 2655 execlists_schedule_out(*execlists->active++); 2656 2657 GEM_BUG_ON(execlists->active - execlists->inflight > 2658 execlists_num_ports(execlists)); 2659 } 2660 } while (head != tail); 2661 2662 execlists->csb_head = head; 2663 set_timeslice(engine); 2664 2665 /* 2666 * Gen11 has proven to fail wrt global observation point between 2667 * entry and tail update, failing on the ordering and thus 2668 * we see an old entry in the context status buffer. 2669 * 2670 * Forcibly evict out entries for the next gpu csb update, 2671 * to increase the odds that we get a fresh entries with non 2672 * working hardware. The cost for doing so comes out mostly with 2673 * the wash as hardware, working or not, will need to do the 2674 * invalidation before. 
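 *
 * The eviction below is simply invalidate_csb_entries(), i.e. a
 * clflush of the first and last entries, which is assumed to be
 * sufficient as the whole buffer spans at most a couple of
 * cachelines.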
2675 */ 2676 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2677 } 2678 2679 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2680 { 2681 lockdep_assert_held(&engine->active.lock); 2682 if (!READ_ONCE(engine->execlists.pending[0])) { 2683 rcu_read_lock(); /* protect peeking at execlists->active */ 2684 execlists_dequeue(engine); 2685 rcu_read_unlock(); 2686 } 2687 } 2688 2689 static void __execlists_hold(struct i915_request *rq) 2690 { 2691 LIST_HEAD(list); 2692 2693 do { 2694 struct i915_dependency *p; 2695 2696 if (i915_request_is_active(rq)) 2697 __i915_request_unsubmit(rq); 2698 2699 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2700 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2701 i915_request_set_hold(rq); 2702 RQ_TRACE(rq, "on hold\n"); 2703 2704 for_each_waiter(p, rq) { 2705 struct i915_request *w = 2706 container_of(p->waiter, typeof(*w), sched); 2707 2708 /* Leave semaphores spinning on the other engines */ 2709 if (w->engine != rq->engine) 2710 continue; 2711 2712 if (!i915_request_is_ready(w)) 2713 continue; 2714 2715 if (i915_request_completed(w)) 2716 continue; 2717 2718 if (i915_request_on_hold(w)) 2719 continue; 2720 2721 list_move_tail(&w->sched.link, &list); 2722 } 2723 2724 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2725 } while (rq); 2726 } 2727 2728 static bool execlists_hold(struct intel_engine_cs *engine, 2729 struct i915_request *rq) 2730 { 2731 spin_lock_irq(&engine->active.lock); 2732 2733 if (i915_request_completed(rq)) { /* too late! */ 2734 rq = NULL; 2735 goto unlock; 2736 } 2737 2738 if (rq->engine != engine) { /* preempted virtual engine */ 2739 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2740 2741 /* 2742 * intel_context_inflight() is only protected by virtue 2743 * of process_csb() being called only by the tasklet (or 2744 * directly from inside reset while the tasklet is suspended). 2745 * Assert that neither of those are allowed to run while we 2746 * poke at the request queues. 2747 */ 2748 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2749 2750 /* 2751 * An unsubmitted request along a virtual engine will 2752 * remain on the active (this) engine until we are able 2753 * to process the context switch away (and so mark the 2754 * context as no longer in flight). That cannot have happened 2755 * yet, otherwise we would not be hanging! 2756 */ 2757 spin_lock(&ve->base.active.lock); 2758 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2759 GEM_BUG_ON(ve->request != rq); 2760 ve->request = NULL; 2761 spin_unlock(&ve->base.active.lock); 2762 i915_request_put(rq); 2763 2764 rq->engine = engine; 2765 } 2766 2767 /* 2768 * Transfer this request onto the hold queue to prevent it 2769 * being resumbitted to HW (and potentially completed) before we have 2770 * released it. Since we may have already submitted following 2771 * requests, we need to remove those as well. 2772 */ 2773 GEM_BUG_ON(i915_request_on_hold(rq)); 2774 GEM_BUG_ON(rq->engine != engine); 2775 __execlists_hold(rq); 2776 GEM_BUG_ON(list_empty(&engine->active.hold)); 2777 2778 unlock: 2779 spin_unlock_irq(&engine->active.lock); 2780 return rq; 2781 } 2782 2783 static bool hold_request(const struct i915_request *rq) 2784 { 2785 struct i915_dependency *p; 2786 bool result = false; 2787 2788 /* 2789 * If one of our ancestors is on hold, we must also be on hold, 2790 * otherwise we will bypass it and execute before it. 
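 *
 * For example, if request B has a signaler A on the same engine and A
 * currently sits on engine->active.hold, letting B into the priority
 * queue now would allow it to run ahead of A; returning true here
 * makes ancestor_on_hold() park B on the hold list as well (see
 * execlists_submit_request() below).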
2791 */ 2792 rcu_read_lock(); 2793 for_each_signaler(p, rq) { 2794 const struct i915_request *s = 2795 container_of(p->signaler, typeof(*s), sched); 2796 2797 if (s->engine != rq->engine) 2798 continue; 2799 2800 result = i915_request_on_hold(s); 2801 if (result) 2802 break; 2803 } 2804 rcu_read_unlock(); 2805 2806 return result; 2807 } 2808 2809 static void __execlists_unhold(struct i915_request *rq) 2810 { 2811 LIST_HEAD(list); 2812 2813 do { 2814 struct i915_dependency *p; 2815 2816 RQ_TRACE(rq, "hold release\n"); 2817 2818 GEM_BUG_ON(!i915_request_on_hold(rq)); 2819 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2820 2821 i915_request_clear_hold(rq); 2822 list_move_tail(&rq->sched.link, 2823 i915_sched_lookup_priolist(rq->engine, 2824 rq_prio(rq))); 2825 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2826 2827 /* Also release any children on this engine that are ready */ 2828 for_each_waiter(p, rq) { 2829 struct i915_request *w = 2830 container_of(p->waiter, typeof(*w), sched); 2831 2832 /* Propagate any change in error status */ 2833 if (rq->fence.error) 2834 i915_request_set_error_once(w, rq->fence.error); 2835 2836 if (w->engine != rq->engine) 2837 continue; 2838 2839 if (!i915_request_on_hold(w)) 2840 continue; 2841 2842 /* Check that no other parents are also on hold */ 2843 if (hold_request(w)) 2844 continue; 2845 2846 list_move_tail(&w->sched.link, &list); 2847 } 2848 2849 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2850 } while (rq); 2851 } 2852 2853 static void execlists_unhold(struct intel_engine_cs *engine, 2854 struct i915_request *rq) 2855 { 2856 spin_lock_irq(&engine->active.lock); 2857 2858 /* 2859 * Move this request back to the priority queue, and all of its 2860 * children and grandchildren that were suspended along with it. 2861 */ 2862 __execlists_unhold(rq); 2863 2864 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2865 engine->execlists.queue_priority_hint = rq_prio(rq); 2866 tasklet_hi_schedule(&engine->execlists.tasklet); 2867 } 2868 2869 spin_unlock_irq(&engine->active.lock); 2870 } 2871 2872 struct execlists_capture { 2873 struct work_struct work; 2874 struct i915_request *rq; 2875 struct i915_gpu_coredump *error; 2876 }; 2877 2878 static void execlists_capture_work(struct work_struct *work) 2879 { 2880 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2881 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2882 struct intel_engine_cs *engine = cap->rq->engine; 2883 struct intel_gt_coredump *gt = cap->error->gt; 2884 struct intel_engine_capture_vma *vma; 2885 2886 /* Compress all the objects attached to the request, slow! 
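 *
 * This is why the capture is split in two: the registers are
 * snapshotted with GFP_ATOMIC from the softirq in execlists_capture(),
 * while the object contents are compressed here in a worker where
 * GFP_KERNEL allocations and the slow compression are acceptable.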
*/ 2887 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2888 if (vma) { 2889 struct i915_vma_compress *compress = 2890 i915_vma_capture_prepare(gt); 2891 2892 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2893 i915_vma_capture_finish(gt, compress); 2894 } 2895 2896 gt->simulated = gt->engine->simulated; 2897 cap->error->simulated = gt->simulated; 2898 2899 /* Publish the error state, and announce it to the world */ 2900 i915_error_state_store(cap->error); 2901 i915_gpu_coredump_put(cap->error); 2902 2903 /* Return this request and all that depend upon it for signaling */ 2904 execlists_unhold(engine, cap->rq); 2905 i915_request_put(cap->rq); 2906 2907 kfree(cap); 2908 } 2909 2910 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2911 { 2912 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2913 struct execlists_capture *cap; 2914 2915 cap = kmalloc(sizeof(*cap), gfp); 2916 if (!cap) 2917 return NULL; 2918 2919 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2920 if (!cap->error) 2921 goto err_cap; 2922 2923 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2924 if (!cap->error->gt) 2925 goto err_gpu; 2926 2927 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2928 if (!cap->error->gt->engine) 2929 goto err_gt; 2930 2931 return cap; 2932 2933 err_gt: 2934 kfree(cap->error->gt); 2935 err_gpu: 2936 kfree(cap->error); 2937 err_cap: 2938 kfree(cap); 2939 return NULL; 2940 } 2941 2942 static struct i915_request * 2943 active_context(struct intel_engine_cs *engine, u32 ccid) 2944 { 2945 const struct intel_engine_execlists * const el = &engine->execlists; 2946 struct i915_request * const *port, *rq; 2947 2948 /* 2949 * Use the most recent result from process_csb(), but just in case 2950 * we trigger an error (via interrupt) before the first CS event has 2951 * been written, peek at the next submission. 2952 */ 2953 2954 for (port = el->active; (rq = *port); port++) { 2955 if (rq->context->lrc.ccid == ccid) { 2956 ENGINE_TRACE(engine, 2957 "ccid found at active:%zd\n", 2958 port - el->active); 2959 return rq; 2960 } 2961 } 2962 2963 for (port = el->pending; (rq = *port); port++) { 2964 if (rq->context->lrc.ccid == ccid) { 2965 ENGINE_TRACE(engine, 2966 "ccid found at pending:%zd\n", 2967 port - el->pending); 2968 return rq; 2969 } 2970 } 2971 2972 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 2973 return NULL; 2974 } 2975 2976 static u32 active_ccid(struct intel_engine_cs *engine) 2977 { 2978 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 2979 } 2980 2981 static bool execlists_capture(struct intel_engine_cs *engine) 2982 { 2983 struct execlists_capture *cap; 2984 2985 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 2986 return true; 2987 2988 /* 2989 * We need to _quickly_ capture the engine state before we reset. 2990 * We are inside an atomic section (softirq) here and we are delaying 2991 * the forced preemption event. 2992 */ 2993 cap = capture_regs(engine); 2994 if (!cap) 2995 return true; 2996 2997 spin_lock_irq(&engine->active.lock); 2998 cap->rq = active_context(engine, active_ccid(engine)); 2999 if (cap->rq) { 3000 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3001 cap->rq = i915_request_get_rcu(cap->rq); 3002 } 3003 spin_unlock_irq(&engine->active.lock); 3004 if (!cap->rq) 3005 goto err_free; 3006 3007 /* 3008 * Remove the request from the execlists queue, and take ownership 3009 * of the request. 
We pass it to our worker who will _slowly_ compress 3010 * all the pages the _user_ requested for debugging their batch, after 3011 * which we return it to the queue for signaling. 3012 * 3013 * By removing them from the execlists queue, we also remove the 3014 * requests from being processed by __unwind_incomplete_requests() 3015 * during the intel_engine_reset(), and so they will *not* be replayed 3016 * afterwards. 3017 * 3018 * Note that because we have not yet reset the engine at this point, 3019 * it is possible for the request that we have identified as being 3020 * guilty, did in fact complete and we will then hit an arbitration 3021 * point allowing the outstanding preemption to succeed. The likelihood 3022 * of that is very low (as capturing of the engine registers should be 3023 * fast enough to run inside an irq-off atomic section!), so we will 3024 * simply hold that request accountable for being non-preemptible 3025 * long enough to force the reset. 3026 */ 3027 if (!execlists_hold(engine, cap->rq)) 3028 goto err_rq; 3029 3030 INIT_WORK(&cap->work, execlists_capture_work); 3031 schedule_work(&cap->work); 3032 return true; 3033 3034 err_rq: 3035 i915_request_put(cap->rq); 3036 err_free: 3037 i915_gpu_coredump_put(cap->error); 3038 kfree(cap); 3039 return false; 3040 } 3041 3042 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3043 { 3044 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3045 unsigned long *lock = &engine->gt->reset.flags; 3046 3047 if (!intel_has_reset_engine(engine->gt)) 3048 return; 3049 3050 if (test_and_set_bit(bit, lock)) 3051 return; 3052 3053 ENGINE_TRACE(engine, "reset for %s\n", msg); 3054 3055 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3056 tasklet_disable_nosync(&engine->execlists.tasklet); 3057 3058 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3059 if (execlists_capture(engine)) 3060 intel_engine_reset(engine, msg); 3061 else 3062 ring_set_paused(engine, 0); 3063 3064 tasklet_enable(&engine->execlists.tasklet); 3065 clear_and_wake_up_bit(bit, lock); 3066 } 3067 3068 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3069 { 3070 const struct timer_list *t = &engine->execlists.preempt; 3071 3072 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3073 return false; 3074 3075 if (!timer_expired(t)) 3076 return false; 3077 3078 return READ_ONCE(engine->execlists.pending[0]); 3079 } 3080 3081 /* 3082 * Check the unread Context Status Buffers and manage the submission of new 3083 * contexts to the ELSP accordingly. 
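 *
 * Completed events are processed first so that any ELSP slots they
 * release are accounted for before we decide whether to dequeue more
 * requests.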
3084 */ 3085 static void execlists_submission_tasklet(unsigned long data) 3086 { 3087 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3088 bool timeout = preempt_timeout(engine); 3089 3090 process_csb(engine); 3091 3092 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3093 engine->execlists.error_interrupt = 0; 3094 if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */ 3095 execlists_reset(engine, "CS error"); 3096 } 3097 3098 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3099 unsigned long flags; 3100 3101 spin_lock_irqsave(&engine->active.lock, flags); 3102 __execlists_submission_tasklet(engine); 3103 spin_unlock_irqrestore(&engine->active.lock, flags); 3104 3105 /* Recheck after serialising with direct-submission */ 3106 if (unlikely(timeout && preempt_timeout(engine))) 3107 execlists_reset(engine, "preemption time out"); 3108 } 3109 } 3110 3111 static void __execlists_kick(struct intel_engine_execlists *execlists) 3112 { 3113 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3114 tasklet_hi_schedule(&execlists->tasklet); 3115 } 3116 3117 #define execlists_kick(t, member) \ 3118 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3119 3120 static void execlists_timeslice(struct timer_list *timer) 3121 { 3122 execlists_kick(timer, timer); 3123 } 3124 3125 static void execlists_preempt(struct timer_list *timer) 3126 { 3127 execlists_kick(timer, preempt); 3128 } 3129 3130 static void queue_request(struct intel_engine_cs *engine, 3131 struct i915_request *rq) 3132 { 3133 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3134 list_add_tail(&rq->sched.link, 3135 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3136 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3137 } 3138 3139 static void __submit_queue_imm(struct intel_engine_cs *engine) 3140 { 3141 struct intel_engine_execlists * const execlists = &engine->execlists; 3142 3143 if (reset_in_progress(execlists)) 3144 return; /* defer until we restart the engine following reset */ 3145 3146 /* Hopefully we clear execlists->pending[] to let us through */ 3147 if (READ_ONCE(execlists->pending[0]) && 3148 tasklet_trylock(&execlists->tasklet)) { 3149 process_csb(engine); 3150 tasklet_unlock(&execlists->tasklet); 3151 } 3152 3153 __execlists_submission_tasklet(engine); 3154 } 3155 3156 static void submit_queue(struct intel_engine_cs *engine, 3157 const struct i915_request *rq) 3158 { 3159 struct intel_engine_execlists *execlists = &engine->execlists; 3160 3161 if (rq_prio(rq) <= execlists->queue_priority_hint) 3162 return; 3163 3164 execlists->queue_priority_hint = rq_prio(rq); 3165 __submit_queue_imm(engine); 3166 } 3167 3168 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3169 const struct i915_request *rq) 3170 { 3171 GEM_BUG_ON(i915_request_on_hold(rq)); 3172 return !list_empty(&engine->active.hold) && hold_request(rq); 3173 } 3174 3175 static void execlists_submit_request(struct i915_request *request) 3176 { 3177 struct intel_engine_cs *engine = request->engine; 3178 unsigned long flags; 3179 3180 /* Will be called from irq-context when using foreign fences. 
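 * Hence the irqsave variant taken on engine->active.lock just below.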
*/ 3181 spin_lock_irqsave(&engine->active.lock, flags); 3182 3183 if (unlikely(ancestor_on_hold(engine, request))) { 3184 RQ_TRACE(request, "ancestor on hold\n"); 3185 list_add_tail(&request->sched.link, &engine->active.hold); 3186 i915_request_set_hold(request); 3187 } else { 3188 queue_request(engine, request); 3189 3190 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3191 GEM_BUG_ON(list_empty(&request->sched.link)); 3192 3193 submit_queue(engine, request); 3194 } 3195 3196 spin_unlock_irqrestore(&engine->active.lock, flags); 3197 } 3198 3199 static void __execlists_context_fini(struct intel_context *ce) 3200 { 3201 intel_ring_put(ce->ring); 3202 i915_vma_put(ce->state); 3203 } 3204 3205 static void execlists_context_destroy(struct kref *kref) 3206 { 3207 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3208 3209 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3210 GEM_BUG_ON(intel_context_is_pinned(ce)); 3211 3212 if (ce->state) 3213 __execlists_context_fini(ce); 3214 3215 intel_context_fini(ce); 3216 intel_context_free(ce); 3217 } 3218 3219 static void 3220 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3221 { 3222 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3223 return; 3224 3225 vaddr += engine->context_size; 3226 3227 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3228 } 3229 3230 static void 3231 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3232 { 3233 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3234 return; 3235 3236 vaddr += engine->context_size; 3237 3238 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3239 drm_err_once(&engine->i915->drm, 3240 "%s context redzone overwritten!\n", 3241 engine->name); 3242 } 3243 3244 static void execlists_context_unpin(struct intel_context *ce) 3245 { 3246 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3247 ce->engine); 3248 3249 i915_gem_object_unpin_map(ce->state->obj); 3250 } 3251 3252 static u32 * 3253 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3254 { 3255 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3256 MI_SRM_LRM_GLOBAL_GTT | 3257 MI_LRI_LRM_CS_MMIO; 3258 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3259 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3260 CTX_TIMESTAMP * sizeof(u32); 3261 *cs++ = 0; 3262 3263 *cs++ = MI_LOAD_REGISTER_REG | 3264 MI_LRR_SOURCE_CS_MMIO | 3265 MI_LRI_LRM_CS_MMIO; 3266 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3267 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3268 3269 *cs++ = MI_LOAD_REGISTER_REG | 3270 MI_LRR_SOURCE_CS_MMIO | 3271 MI_LRI_LRM_CS_MMIO; 3272 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3273 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3274 3275 return cs; 3276 } 3277 3278 static u32 * 3279 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3280 { 3281 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3282 3283 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3284 MI_SRM_LRM_GLOBAL_GTT | 3285 MI_LRI_LRM_CS_MMIO; 3286 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3287 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3288 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3289 *cs++ = 0; 3290 3291 return cs; 3292 } 3293 3294 static u32 * 3295 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3296 { 3297 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3298 3299 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3300 MI_SRM_LRM_GLOBAL_GTT | 3301 MI_LRI_LRM_CS_MMIO; 3302 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 
0)); 3303 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3304 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3305 *cs++ = 0; 3306 3307 *cs++ = MI_LOAD_REGISTER_REG | 3308 MI_LRR_SOURCE_CS_MMIO | 3309 MI_LRI_LRM_CS_MMIO; 3310 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3311 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3312 3313 return cs; 3314 } 3315 3316 static u32 * 3317 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3318 { 3319 cs = gen12_emit_timestamp_wa(ce, cs); 3320 cs = gen12_emit_cmd_buf_wa(ce, cs); 3321 cs = gen12_emit_restore_scratch(ce, cs); 3322 3323 return cs; 3324 } 3325 3326 static u32 * 3327 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3328 { 3329 cs = gen12_emit_timestamp_wa(ce, cs); 3330 cs = gen12_emit_restore_scratch(ce, cs); 3331 3332 return cs; 3333 } 3334 3335 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3336 { 3337 return PAGE_SIZE * ce->wa_bb_page; 3338 } 3339 3340 static u32 *context_indirect_bb(const struct intel_context *ce) 3341 { 3342 void *ptr; 3343 3344 GEM_BUG_ON(!ce->wa_bb_page); 3345 3346 ptr = ce->lrc_reg_state; 3347 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3348 ptr += context_wa_bb_offset(ce); 3349 3350 return ptr; 3351 } 3352 3353 static void 3354 setup_indirect_ctx_bb(const struct intel_context *ce, 3355 const struct intel_engine_cs *engine, 3356 u32 *(*emit)(const struct intel_context *, u32 *)) 3357 { 3358 u32 * const start = context_indirect_bb(ce); 3359 u32 *cs; 3360 3361 cs = emit(ce, start); 3362 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3363 while ((unsigned long)cs % CACHELINE_BYTES) 3364 *cs++ = MI_NOOP; 3365 3366 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3367 i915_ggtt_offset(ce->state) + 3368 context_wa_bb_offset(ce), 3369 (cs - start) * sizeof(*cs)); 3370 } 3371 3372 static void 3373 __execlists_update_reg_state(const struct intel_context *ce, 3374 const struct intel_engine_cs *engine, 3375 u32 head) 3376 { 3377 struct intel_ring *ring = ce->ring; 3378 u32 *regs = ce->lrc_reg_state; 3379 3380 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3381 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3382 3383 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3384 regs[CTX_RING_HEAD] = head; 3385 regs[CTX_RING_TAIL] = ring->tail; 3386 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3387 3388 /* RPCS */ 3389 if (engine->class == RENDER_CLASS) { 3390 regs[CTX_R_PWR_CLK_STATE] = 3391 intel_sseu_make_rpcs(engine->i915, &ce->sseu); 3392 3393 i915_oa_init_reg_state(ce, engine); 3394 } 3395 3396 if (ce->wa_bb_page) { 3397 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3398 3399 fn = gen12_emit_indirect_ctx_xcs; 3400 if (ce->engine->class == RENDER_CLASS) 3401 fn = gen12_emit_indirect_ctx_rcs; 3402 3403 /* Mutually exclusive wrt to global indirect bb */ 3404 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3405 setup_indirect_ctx_bb(ce, engine, fn); 3406 } 3407 } 3408 3409 static int 3410 __execlists_context_pin(struct intel_context *ce, 3411 struct intel_engine_cs *engine) 3412 { 3413 void *vaddr; 3414 3415 GEM_BUG_ON(!ce->state); 3416 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3417 3418 vaddr = i915_gem_object_pin_map(ce->state->obj, 3419 i915_coherent_map_type(engine->i915) | 3420 I915_MAP_OVERRIDE); 3421 if (IS_ERR(vaddr)) 3422 return PTR_ERR(vaddr); 3423 3424 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3425 ce->lrc_reg_state = vaddr + 
LRC_STATE_OFFSET; 3426 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3427 3428 return 0; 3429 } 3430 3431 static int execlists_context_pin(struct intel_context *ce) 3432 { 3433 return __execlists_context_pin(ce, ce->engine); 3434 } 3435 3436 static int execlists_context_alloc(struct intel_context *ce) 3437 { 3438 return __execlists_context_alloc(ce, ce->engine); 3439 } 3440 3441 static void execlists_context_reset(struct intel_context *ce) 3442 { 3443 CE_TRACE(ce, "reset\n"); 3444 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3445 3446 intel_ring_reset(ce->ring, ce->ring->emit); 3447 3448 /* Scrub away the garbage */ 3449 execlists_init_reg_state(ce->lrc_reg_state, 3450 ce, ce->engine, ce->ring, true); 3451 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3452 3453 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3454 } 3455 3456 static const struct intel_context_ops execlists_context_ops = { 3457 .alloc = execlists_context_alloc, 3458 3459 .pin = execlists_context_pin, 3460 .unpin = execlists_context_unpin, 3461 3462 .enter = intel_context_enter_engine, 3463 .exit = intel_context_exit_engine, 3464 3465 .reset = execlists_context_reset, 3466 .destroy = execlists_context_destroy, 3467 }; 3468 3469 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3470 { 3471 u32 *cs; 3472 3473 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3474 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3475 return 0; 3476 3477 cs = intel_ring_begin(rq, 6); 3478 if (IS_ERR(cs)) 3479 return PTR_ERR(cs); 3480 3481 /* 3482 * Check if we have been preempted before we even get started. 3483 * 3484 * After this point i915_request_started() reports true, even if 3485 * we get preempted and so are no longer running. 3486 */ 3487 *cs++ = MI_ARB_CHECK; 3488 *cs++ = MI_NOOP; 3489 3490 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3491 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3492 *cs++ = 0; 3493 *cs++ = rq->fence.seqno - 1; 3494 3495 intel_ring_advance(rq, cs); 3496 3497 /* Record the updated position of the request's payload */ 3498 rq->infix = intel_ring_offset(rq, cs); 3499 3500 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3501 3502 return 0; 3503 } 3504 3505 static int emit_pdps(struct i915_request *rq) 3506 { 3507 const struct intel_engine_cs * const engine = rq->engine; 3508 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3509 int err, i; 3510 u32 *cs; 3511 3512 GEM_BUG_ON(intel_vgpu_active(rq->i915)); 3513 3514 /* 3515 * Beware ye of the dragons, this sequence is magic! 3516 * 3517 * Small changes to this sequence can cause anything from 3518 * GPU hangs to forcewake errors and machine lockups! 3519 */ 3520 3521 /* Flush any residual operations from the context load */ 3522 err = engine->emit_flush(rq, EMIT_FLUSH); 3523 if (err) 3524 return err; 3525 3526 /* Magic required to prevent forcewake errors! 
*/ 3527 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3528 if (err) 3529 return err; 3530 3531 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3532 if (IS_ERR(cs)) 3533 return PTR_ERR(cs); 3534 3535 /* Ensure the LRI have landed before we invalidate & continue */ 3536 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3537 for (i = GEN8_3LVL_PDPES; i--; ) { 3538 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3539 u32 base = engine->mmio_base; 3540 3541 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3542 *cs++ = upper_32_bits(pd_daddr); 3543 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3544 *cs++ = lower_32_bits(pd_daddr); 3545 } 3546 *cs++ = MI_NOOP; 3547 3548 intel_ring_advance(rq, cs); 3549 3550 return 0; 3551 } 3552 3553 static int execlists_request_alloc(struct i915_request *request) 3554 { 3555 int ret; 3556 3557 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3558 3559 /* 3560 * Flush enough space to reduce the likelihood of waiting after 3561 * we start building the request - in which case we will just 3562 * have to repeat work. 3563 */ 3564 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3565 3566 /* 3567 * Note that after this point, we have committed to using 3568 * this request as it is being used to both track the 3569 * state of engine initialisation and liveness of the 3570 * golden renderstate above. Think twice before you try 3571 * to cancel/unwind this request now. 3572 */ 3573 3574 if (!i915_vm_is_4lvl(request->context->vm)) { 3575 ret = emit_pdps(request); 3576 if (ret) 3577 return ret; 3578 } 3579 3580 /* Unconditionally invalidate GPU caches and TLBs. */ 3581 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3582 if (ret) 3583 return ret; 3584 3585 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3586 return 0; 3587 } 3588 3589 /* 3590 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 3591 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3592 * but there is a slight complication as this is applied in WA batch where the 3593 * values are only initialized once so we cannot take register value at the 3594 * beginning and reuse it further; hence we save its value to memory, upload a 3595 * constant value with bit21 set and then we restore it back with the saved value. 3596 * To simplify the WA, a constant value is formed by using the default value 3597 * of this register. This shouldn't be a problem because we are only modifying 3598 * it for a short period and this batch in non-premptible. We can ofcourse 3599 * use additional instructions that read the actual value of the register 3600 * at that time and set our bit of interest but it makes the WA complicated. 3601 * 3602 * This WA is also required for Gen9 so extracting as a function avoids 3603 * code duplication. 3604 */ 3605 static u32 * 3606 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3607 { 3608 /* NB no one else is allowed to scribble over scratch + 256! 
*/ 3609 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3610 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3611 *batch++ = intel_gt_scratch_offset(engine->gt, 3612 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3613 *batch++ = 0; 3614 3615 *batch++ = MI_LOAD_REGISTER_IMM(1); 3616 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3617 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3618 3619 batch = gen8_emit_pipe_control(batch, 3620 PIPE_CONTROL_CS_STALL | 3621 PIPE_CONTROL_DC_FLUSH_ENABLE, 3622 0); 3623 3624 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3625 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3626 *batch++ = intel_gt_scratch_offset(engine->gt, 3627 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3628 *batch++ = 0; 3629 3630 return batch; 3631 } 3632 3633 /* 3634 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3635 * initialized at the beginning and shared across all contexts but this field 3636 * helps us to have multiple batches at different offsets and select them based 3637 * on a criteria. At the moment this batch always start at the beginning of the page 3638 * and at this point we don't have multiple wa_ctx batch buffers. 3639 * 3640 * The number of WA applied are not known at the beginning; we use this field 3641 * to return the no of DWORDS written. 3642 * 3643 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3644 * so it adds NOOPs as padding to make it cacheline aligned. 3645 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3646 * makes a complete batch buffer. 3647 */ 3648 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3649 { 3650 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3651 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3652 3653 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3654 if (IS_BROADWELL(engine->i915)) 3655 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3656 3657 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3658 /* Actual scratch location is at 128 bytes offset */ 3659 batch = gen8_emit_pipe_control(batch, 3660 PIPE_CONTROL_FLUSH_L3 | 3661 PIPE_CONTROL_STORE_DATA_INDEX | 3662 PIPE_CONTROL_CS_STALL | 3663 PIPE_CONTROL_QW_WRITE, 3664 LRC_PPHWSP_SCRATCH_ADDR); 3665 3666 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3667 3668 /* Pad to end of cacheline */ 3669 while ((unsigned long)batch % CACHELINE_BYTES) 3670 *batch++ = MI_NOOP; 3671 3672 /* 3673 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3674 * execution depends on the length specified in terms of cache lines 3675 * in the register CTX_RCS_INDIRECT_CTX 3676 */ 3677 3678 return batch; 3679 } 3680 3681 struct lri { 3682 i915_reg_t reg; 3683 u32 value; 3684 }; 3685 3686 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3687 { 3688 GEM_BUG_ON(!count || count > 63); 3689 3690 *batch++ = MI_LOAD_REGISTER_IMM(count); 3691 do { 3692 *batch++ = i915_mmio_reg_offset(lri->reg); 3693 *batch++ = lri->value; 3694 } while (lri++, --count); 3695 *batch++ = MI_NOOP; 3696 3697 return batch; 3698 } 3699 3700 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3701 { 3702 static const struct lri lri[] = { 3703 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3704 { 3705 COMMON_SLICE_CHICKEN2, 3706 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3707 0), 3708 }, 3709 3710 /* BSpec: 11391 */ 3711 { 3712 FF_SLICE_CHICKEN, 3713 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3714 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3715 }, 3716 3717 /* BSpec: 11299 */ 3718 { 3719 _3D_CHICKEN3, 3720 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3721 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3722 } 3723 }; 3724 3725 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3726 3727 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3728 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3729 3730 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3731 batch = gen8_emit_pipe_control(batch, 3732 PIPE_CONTROL_FLUSH_L3 | 3733 PIPE_CONTROL_STORE_DATA_INDEX | 3734 PIPE_CONTROL_CS_STALL | 3735 PIPE_CONTROL_QW_WRITE, 3736 LRC_PPHWSP_SCRATCH_ADDR); 3737 3738 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3739 3740 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3741 if (HAS_POOLED_EU(engine->i915)) { 3742 /* 3743 * EU pool configuration is setup along with golden context 3744 * during context initialization. This value depends on 3745 * device type (2x6 or 3x6) and needs to be updated based 3746 * on which subslice is disabled especially for 2x6 3747 * devices, however it is safe to load default 3748 * configuration of 3x6 device instead of masking off 3749 * corresponding bits because HW ignores bits of a disabled 3750 * subslice and drops down to appropriate config. Please 3751 * see render_state_setup() in i915_gem_render_state.c for 3752 * possible configurations, to avoid duplication they are 3753 * not shown here again. 3754 */ 3755 *batch++ = GEN9_MEDIA_POOL_STATE; 3756 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3757 *batch++ = 0x00777000; 3758 *batch++ = 0; 3759 *batch++ = 0; 3760 *batch++ = 0; 3761 } 3762 3763 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3764 3765 /* Pad to end of cacheline */ 3766 while ((unsigned long)batch % CACHELINE_BYTES) 3767 *batch++ = MI_NOOP; 3768 3769 return batch; 3770 } 3771 3772 static u32 * 3773 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3774 { 3775 int i; 3776 3777 /* 3778 * WaPipeControlBefore3DStateSamplePattern: cnl 3779 * 3780 * Ensure the engine is idle prior to programming a 3781 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3782 */ 3783 batch = gen8_emit_pipe_control(batch, 3784 PIPE_CONTROL_CS_STALL, 3785 0); 3786 /* 3787 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3788 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3789 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3790 * confusing. Since gen8_emit_pipe_control() already advances the 3791 * batch by 6 dwords, we advance the other 10 here, completing a 3792 * cacheline. It's not clear if the workaround requires this padding 3793 * before other commands, or if it's just the regular padding we would 3794 * already have for the workaround bb, so leave it here for now. 
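 *
 * For the arithmetic: the 6 dwords emitted by gen8_emit_pipe_control()
 * plus the 10 MI_NOOPs below come to 16 dwords, i.e. 64 bytes, one
 * full cacheline of padding (CACHELINE_BYTES).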
3795 */ 3796 for (i = 0; i < 10; i++) 3797 *batch++ = MI_NOOP; 3798 3799 /* Pad to end of cacheline */ 3800 while ((unsigned long)batch % CACHELINE_BYTES) 3801 *batch++ = MI_NOOP; 3802 3803 return batch; 3804 } 3805 3806 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3807 3808 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3809 { 3810 struct drm_i915_gem_object *obj; 3811 struct i915_vma *vma; 3812 int err; 3813 3814 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3815 if (IS_ERR(obj)) 3816 return PTR_ERR(obj); 3817 3818 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3819 if (IS_ERR(vma)) { 3820 err = PTR_ERR(vma); 3821 goto err; 3822 } 3823 3824 err = i915_ggtt_pin(vma, 0, PIN_HIGH); 3825 if (err) 3826 goto err; 3827 3828 engine->wa_ctx.vma = vma; 3829 return 0; 3830 3831 err: 3832 i915_gem_object_put(obj); 3833 return err; 3834 } 3835 3836 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3837 { 3838 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3839 } 3840 3841 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3842 3843 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3844 { 3845 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3846 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3847 &wa_ctx->per_ctx }; 3848 wa_bb_func_t wa_bb_fn[2]; 3849 struct page *page; 3850 void *batch, *batch_ptr; 3851 unsigned int i; 3852 int ret; 3853 3854 if (engine->class != RENDER_CLASS) 3855 return 0; 3856 3857 switch (INTEL_GEN(engine->i915)) { 3858 case 12: 3859 case 11: 3860 return 0; 3861 case 10: 3862 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3863 wa_bb_fn[1] = NULL; 3864 break; 3865 case 9: 3866 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3867 wa_bb_fn[1] = NULL; 3868 break; 3869 case 8: 3870 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3871 wa_bb_fn[1] = NULL; 3872 break; 3873 default: 3874 MISSING_CASE(INTEL_GEN(engine->i915)); 3875 return 0; 3876 } 3877 3878 ret = lrc_setup_wa_ctx(engine); 3879 if (ret) { 3880 drm_dbg(&engine->i915->drm, 3881 "Failed to setup context WA page: %d\n", ret); 3882 return ret; 3883 } 3884 3885 page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); 3886 batch = batch_ptr = kmap_atomic(page); 3887 3888 /* 3889 * Emit the two workaround batch buffers, recording the offset from the 3890 * start of the workaround batch buffer object for each and their 3891 * respective sizes. 3892 */ 3893 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3894 wa_bb[i]->offset = batch_ptr - batch; 3895 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3896 CACHELINE_BYTES))) { 3897 ret = -EINVAL; 3898 break; 3899 } 3900 if (wa_bb_fn[i]) 3901 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3902 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3903 } 3904 3905 BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3906 3907 kunmap_atomic(batch); 3908 if (ret) 3909 lrc_destroy_wa_ctx(engine); 3910 3911 return ret; 3912 } 3913 3914 static void reset_csb_pointers(struct intel_engine_cs *engine) 3915 { 3916 struct intel_engine_execlists * const execlists = &engine->execlists; 3917 const unsigned int reset_value = execlists->csb_size - 1; 3918 3919 ring_set_paused(engine, 0); 3920 3921 /* 3922 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3923 * Bludgeon them with a mmio update to be sure. 
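	 *
	 * As for the magic value written below: RING_CONTEXT_STATUS_PTR
	 * appears to follow the usual masked-register convention (an
	 * assumption based on how the value is constructed, not a bspec
	 * quote), with the upper 16 bits acting as the write-enable mask and
	 * the low bits holding the read and write pointers, so
	 *
	 *	0xffff << 16 | reset_value << 8 | reset_value
	 *
	 * forces both pointers back to reset_value in a single write.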
3924 */ 3925 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3926 0xffff << 16 | reset_value << 8 | reset_value); 3927 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3928 3929 /* 3930 * After a reset, the HW starts writing into CSB entry [0]. We 3931 * therefore have to set our HEAD pointer back one entry so that 3932 * the *first* entry we check is entry 0. To complicate this further, 3933 * as we don't wait for the first interrupt after reset, we have to 3934 * fake the HW write to point back to the last entry so that our 3935 * inline comparison of our cached head position against the last HW 3936 * write works even before the first interrupt. 3937 */ 3938 execlists->csb_head = reset_value; 3939 WRITE_ONCE(*execlists->csb_write, reset_value); 3940 wmb(); /* Make sure this is visible to HW (paranoia?) */ 3941 3942 invalidate_csb_entries(&execlists->csb_status[0], 3943 &execlists->csb_status[reset_value]); 3944 3945 /* Once more for luck and our trusty paranoia */ 3946 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3947 0xffff << 16 | reset_value << 8 | reset_value); 3948 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3949 3950 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 3951 } 3952 3953 static void execlists_sanitize(struct intel_engine_cs *engine) 3954 { 3955 /* 3956 * Poison residual state on resume, in case the suspend didn't! 3957 * 3958 * We have to assume that across suspend/resume (or other loss 3959 * of control) that the contents of our pinned buffers has been 3960 * lost, replaced by garbage. Since this doesn't always happen, 3961 * let's poison such state so that we more quickly spot when 3962 * we falsely assume it has been preserved. 3963 */ 3964 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3965 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 3966 3967 reset_csb_pointers(engine); 3968 3969 /* 3970 * The kernel_context HWSP is stored in the status_page. As above, 3971 * that may be lost on resume/initialisation, and so we need to 3972 * reset the value in the HWSP. 3973 */ 3974 intel_timeline_reset_seqno(engine->kernel_context->timeline); 3975 3976 /* And scrub the dirty cachelines for the HWSP */ 3977 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 3978 } 3979 3980 static void enable_error_interrupt(struct intel_engine_cs *engine) 3981 { 3982 u32 status; 3983 3984 engine->execlists.error_interrupt = 0; 3985 ENGINE_WRITE(engine, RING_EMR, ~0u); 3986 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 3987 3988 status = ENGINE_READ(engine, RING_ESR); 3989 if (unlikely(status)) { 3990 drm_err(&engine->i915->drm, 3991 "engine '%s' resumed still in error: %08x\n", 3992 engine->name, status); 3993 __intel_gt_reset(engine->gt, engine->mask); 3994 } 3995 3996 /* 3997 * On current gen8+, we have 2 signals to play with 3998 * 3999 * - I915_ERROR_INSTUCTION (bit 0) 4000 * 4001 * Generate an error if the command parser encounters an invalid 4002 * instruction 4003 * 4004 * This is a fatal error. 4005 * 4006 * - CP_PRIV (bit 2) 4007 * 4008 * Generate an error on privilege violation (where the CP replaces 4009 * the instruction with a no-op). This also fires for writes into 4010 * read-only scratch pages. 4011 * 4012 * This is a non-fatal error, parsing continues. 
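	 *
	 * (For reference on the register trio used above, going by their
	 * names and by how this function uses them: RING_ESR is the current
	 * error status we sample, RING_EIR holds the sticky errors we clear
	 * by writing ~0, and RING_EMR is the mask, where a set bit suppresses
	 * that error source; hence the final write below of
	 * ~I915_ERROR_INSTRUCTION leaves only the instruction error
	 * unmasked.)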
4013 * 4014 * * there are a few others defined for odd HW that we do not use 4015 * 4016 * Since CP_PRIV fires for cases where we have chosen to ignore the 4017 * error (as the HW is validating and suppressing the mistakes), we 4018 * only unmask the instruction error bit. 4019 */ 4020 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4021 } 4022 4023 static void enable_execlists(struct intel_engine_cs *engine) 4024 { 4025 u32 mode; 4026 4027 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4028 4029 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4030 4031 if (INTEL_GEN(engine->i915) >= 11) 4032 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4033 else 4034 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4035 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4036 4037 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4038 4039 ENGINE_WRITE_FW(engine, 4040 RING_HWS_PGA, 4041 i915_ggtt_offset(engine->status_page.vma)); 4042 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4043 4044 enable_error_interrupt(engine); 4045 4046 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4047 } 4048 4049 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4050 { 4051 bool unexpected = false; 4052 4053 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4054 drm_dbg(&engine->i915->drm, 4055 "STOP_RING still set in RING_MI_MODE\n"); 4056 unexpected = true; 4057 } 4058 4059 return unexpected; 4060 } 4061 4062 static int execlists_resume(struct intel_engine_cs *engine) 4063 { 4064 intel_mocs_init_engine(engine); 4065 4066 intel_engine_reset_breadcrumbs(engine); 4067 4068 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4069 struct drm_printer p = drm_debug_printer(__func__); 4070 4071 intel_engine_dump(engine, &p, NULL); 4072 } 4073 4074 enable_execlists(engine); 4075 4076 return 0; 4077 } 4078 4079 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4080 { 4081 struct intel_engine_execlists * const execlists = &engine->execlists; 4082 unsigned long flags; 4083 4084 ENGINE_TRACE(engine, "depth<-%d\n", 4085 atomic_read(&execlists->tasklet.count)); 4086 4087 /* 4088 * Prevent request submission to the hardware until we have 4089 * completed the reset in i915_gem_reset_finish(). If a request 4090 * is completed by one engine, it may then queue a request 4091 * to a second via its execlists->tasklet *just* as we are 4092 * calling engine->resume() and also writing the ELSP. 4093 * Turning off the execlists->tasklet until the reset is over 4094 * prevents the race. 4095 */ 4096 __tasklet_disable_sync_once(&execlists->tasklet); 4097 GEM_BUG_ON(!reset_in_progress(execlists)); 4098 4099 /* And flush any current direct submission. */ 4100 spin_lock_irqsave(&engine->active.lock, flags); 4101 spin_unlock_irqrestore(&engine->active.lock, flags); 4102 4103 /* 4104 * We stop engines, otherwise we might get failed reset and a 4105 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4106 * from system hang if batchbuffer is progressing when 4107 * the reset is issued, regardless of READY_TO_RESET ack. 4108 * Thus assume it is best to stop engines on all gens 4109 * where we have a gpu reset. 
4110 * 4111 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4112 * 4113 * FIXME: Wa for more modern gens needs to be validated 4114 */ 4115 ring_set_paused(engine, 1); 4116 intel_engine_stop_cs(engine); 4117 4118 engine->execlists.reset_ccid = active_ccid(engine); 4119 } 4120 4121 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4122 { 4123 int x; 4124 4125 x = lrc_ring_mi_mode(engine); 4126 if (x != -1) { 4127 regs[x + 1] &= ~STOP_RING; 4128 regs[x + 1] |= STOP_RING << 16; 4129 } 4130 } 4131 4132 static void __execlists_reset_reg_state(const struct intel_context *ce, 4133 const struct intel_engine_cs *engine) 4134 { 4135 u32 *regs = ce->lrc_reg_state; 4136 4137 __reset_stop_ring(regs, engine); 4138 } 4139 4140 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4141 { 4142 struct intel_engine_execlists * const execlists = &engine->execlists; 4143 struct intel_context *ce; 4144 struct i915_request *rq; 4145 u32 head; 4146 4147 mb(); /* paranoia: read the CSB pointers from after the reset */ 4148 clflush(execlists->csb_write); 4149 mb(); 4150 4151 process_csb(engine); /* drain preemption events */ 4152 4153 /* Following the reset, we need to reload the CSB read/write pointers */ 4154 reset_csb_pointers(engine); 4155 4156 /* 4157 * Save the currently executing context, even if we completed 4158 * its request, it was still running at the time of the 4159 * reset and will have been clobbered. 4160 */ 4161 rq = active_context(engine, engine->execlists.reset_ccid); 4162 if (!rq) 4163 goto unwind; 4164 4165 ce = rq->context; 4166 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4167 4168 if (i915_request_completed(rq)) { 4169 /* Idle context; tidy up the ring so we can restart afresh */ 4170 head = intel_ring_wrap(ce->ring, rq->tail); 4171 goto out_replay; 4172 } 4173 4174 /* We still have requests in-flight; the engine should be active */ 4175 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4176 4177 /* Context has requests still in-flight; it should not be idle! */ 4178 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4179 4180 rq = active_request(ce->timeline, rq); 4181 head = intel_ring_wrap(ce->ring, rq->head); 4182 GEM_BUG_ON(head == ce->ring->tail); 4183 4184 /* 4185 * If this request hasn't started yet, e.g. it is waiting on a 4186 * semaphore, we need to avoid skipping the request or else we 4187 * break the signaling chain. However, if the context is corrupt 4188 * the request will not restart and we will be stuck with a wedged 4189 * device. It is quite often the case that if we issue a reset 4190 * while the GPU is loading the context image, that the context 4191 * image becomes corrupt. 4192 * 4193 * Otherwise, if we have not started yet, the request should replay 4194 * perfectly and we do not need to flag the result as being erroneous. 4195 */ 4196 if (!i915_request_started(rq)) 4197 goto out_replay; 4198 4199 /* 4200 * If the request was innocent, we leave the request in the ELSP 4201 * and will try to replay it on restarting. The context image may 4202 * have been corrupted by the reset, in which case we may have 4203 * to service a new GPU hang, but more likely we can continue on 4204 * without impact. 4205 * 4206 * If the request was guilty, we presume the context is corrupt 4207 * and have to at least restore the RING register in the context 4208 * image back to the expected values to skip over the guilty request. 
4209 */ 4210 __i915_request_reset(rq, stalled); 4211 4212 /* 4213 * We want a simple context + ring to execute the breadcrumb update. 4214 * We cannot rely on the context being intact across the GPU hang, 4215 * so clear it and rebuild just what we need for the breadcrumb. 4216 * All pending requests for this context will be zapped, and any 4217 * future request will be after userspace has had the opportunity 4218 * to recreate its own state. 4219 */ 4220 out_replay: 4221 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4222 head, ce->ring->tail); 4223 __execlists_reset_reg_state(ce, engine); 4224 __execlists_update_reg_state(ce, engine, head); 4225 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4226 4227 unwind: 4228 /* Push back any incomplete requests for replay after the reset. */ 4229 cancel_port_requests(execlists); 4230 __unwind_incomplete_requests(engine); 4231 } 4232 4233 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4234 { 4235 unsigned long flags; 4236 4237 ENGINE_TRACE(engine, "\n"); 4238 4239 spin_lock_irqsave(&engine->active.lock, flags); 4240 4241 __execlists_reset(engine, stalled); 4242 4243 spin_unlock_irqrestore(&engine->active.lock, flags); 4244 } 4245 4246 static void nop_submission_tasklet(unsigned long data) 4247 { 4248 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4249 4250 /* The driver is wedged; don't process any more events. */ 4251 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4252 } 4253 4254 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4255 { 4256 struct intel_engine_execlists * const execlists = &engine->execlists; 4257 struct i915_request *rq, *rn; 4258 struct rb_node *rb; 4259 unsigned long flags; 4260 4261 ENGINE_TRACE(engine, "\n"); 4262 4263 /* 4264 * Before we call engine->cancel_requests(), we should have exclusive 4265 * access to the submission state. This is arranged for us by the 4266 * caller disabling the interrupt generation, the tasklet and other 4267 * threads that may then access the same state, giving us a free hand 4268 * to reset state. However, we still need to let lockdep be aware that 4269 * we know this state may be accessed in hardirq context, so we 4270 * disable the irq around this manipulation and we want to keep 4271 * the spinlock focused on its duties and not accidentally conflate 4272 * coverage to the submission's irq state. (Similarly, although we 4273 * shouldn't need to disable irq around the manipulation of the 4274 * submission's irq state, we also wish to remind ourselves that 4275 * it is irq state.) 4276 */ 4277 spin_lock_irqsave(&engine->active.lock, flags); 4278 4279 __execlists_reset(engine, true); 4280 4281 /* Mark all executing requests as skipped. */ 4282 list_for_each_entry(rq, &engine->active.requests, sched.link) 4283 mark_eio(rq); 4284 4285 /* Flush the queued requests to the timeline list (for retiring). 
 */
	while ((rb = rb_first_cached(&execlists->queue))) {
		struct i915_priolist *p = to_priolist(rb);
		int i;

		priolist_for_each_request_consume(rq, rn, p, i) {
			mark_eio(rq);
			__i915_request_submit(rq);
		}

		rb_erase_cached(&p->node, &execlists->queue);
		i915_priolist_free(p);
	}

	/* On-hold requests will be flushed to timeline upon their release */
	list_for_each_entry(rq, &engine->active.hold, sched.link)
		mark_eio(rq);

	/* Cancel all attached virtual engines */
	while ((rb = rb_first_cached(&execlists->virtual))) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);

		rb_erase_cached(rb, &execlists->virtual);
		RB_CLEAR_NODE(rb);

		spin_lock(&ve->base.active.lock);
		rq = fetch_and_zero(&ve->request);
		if (rq) {
			mark_eio(rq);

			rq->engine = engine;
			__i915_request_submit(rq);
			i915_request_put(rq);

			ve->base.execlists.queue_priority_hint = INT_MIN;
		}
		spin_unlock(&ve->base.active.lock);
	}

	/* Remaining _unready_ requests will be nop'ed when submitted */

	execlists->queue_priority_hint = INT_MIN;
	execlists->queue = RB_ROOT_CACHED;

	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
	execlists->tasklet.func = nop_submission_tasklet;

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

static void execlists_reset_finish(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;

	/*
	 * After a GPU reset, we may have requests to replay. Do so now while
	 * we still have the forcewake to be sure that the GPU is not allowed
	 * to sleep before we restart and reload a context.
	 */
	GEM_BUG_ON(!reset_in_progress(execlists));
	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
		execlists->tasklet.func(execlists->tasklet.data);

	if (__tasklet_enable(&execlists->tasklet))
		/* And kick in case we missed a new request submission. */
		tasklet_hi_schedule(&execlists->tasklet);
	ENGINE_TRACE(engine, "depth->%d\n",
		     atomic_read(&execlists->tasklet.count));
}

static int gen8_emit_bb_start_noarb(struct i915_request *rq,
				    u64 offset, u32 len,
				    const unsigned int flags)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We would not need to emit MI_ARB_ENABLE as often as we do (in
	 * particular on all the gens that do not need the w/a at all!) if we
	 * made sure that arbitration was enabled on every switch into this
	 * context, both ordinary and for preemption. However, for gen8 there
	 * is another w/a that requires us not to preempt inside GPGPU
	 * execution, so we keep arbitration disabled for gen8 batches.
	 * Arbitration is re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* FIXME(BDW+): Address space and security selectors. */
	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
		(flags & I915_DISPATCH_SECURE ?
0 : BIT(8)); 4384 *cs++ = lower_32_bits(offset); 4385 *cs++ = upper_32_bits(offset); 4386 4387 intel_ring_advance(rq, cs); 4388 4389 return 0; 4390 } 4391 4392 static int gen8_emit_bb_start(struct i915_request *rq, 4393 u64 offset, u32 len, 4394 const unsigned int flags) 4395 { 4396 u32 *cs; 4397 4398 cs = intel_ring_begin(rq, 6); 4399 if (IS_ERR(cs)) 4400 return PTR_ERR(cs); 4401 4402 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4403 4404 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4405 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4406 *cs++ = lower_32_bits(offset); 4407 *cs++ = upper_32_bits(offset); 4408 4409 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4410 *cs++ = MI_NOOP; 4411 4412 intel_ring_advance(rq, cs); 4413 4414 return 0; 4415 } 4416 4417 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4418 { 4419 ENGINE_WRITE(engine, RING_IMR, 4420 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4421 ENGINE_POSTING_READ(engine, RING_IMR); 4422 } 4423 4424 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4425 { 4426 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4427 } 4428 4429 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4430 { 4431 u32 cmd, *cs; 4432 4433 cs = intel_ring_begin(request, 4); 4434 if (IS_ERR(cs)) 4435 return PTR_ERR(cs); 4436 4437 cmd = MI_FLUSH_DW + 1; 4438 4439 /* We always require a command barrier so that subsequent 4440 * commands, such as breadcrumb interrupts, are strictly ordered 4441 * wrt the contents of the write cache being flushed to memory 4442 * (and thus being coherent from the CPU). 4443 */ 4444 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4445 4446 if (mode & EMIT_INVALIDATE) { 4447 cmd |= MI_INVALIDATE_TLB; 4448 if (request->engine->class == VIDEO_DECODE_CLASS) 4449 cmd |= MI_INVALIDATE_BSD; 4450 } 4451 4452 *cs++ = cmd; 4453 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4454 *cs++ = 0; /* upper addr */ 4455 *cs++ = 0; /* value */ 4456 intel_ring_advance(request, cs); 4457 4458 return 0; 4459 } 4460 4461 static int gen8_emit_flush_render(struct i915_request *request, 4462 u32 mode) 4463 { 4464 bool vf_flush_wa = false, dc_flush_wa = false; 4465 u32 *cs, flags = 0; 4466 int len; 4467 4468 flags |= PIPE_CONTROL_CS_STALL; 4469 4470 if (mode & EMIT_FLUSH) { 4471 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4472 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4473 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4474 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4475 } 4476 4477 if (mode & EMIT_INVALIDATE) { 4478 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4479 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4480 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4481 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4482 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4483 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4484 flags |= PIPE_CONTROL_QW_WRITE; 4485 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4486 4487 /* 4488 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4489 * pipe control. 
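		 * A "NULL" pipe control here simply means a full-size
		 * PIPE_CONTROL with no flags and no post-sync write, which is
		 * what the gen8_emit_pipe_control(cs, 0, 0) further down
		 * emits when vf_flush_wa is set.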
4490 */ 4491 if (IS_GEN(request->i915, 9)) 4492 vf_flush_wa = true; 4493 4494 /* WaForGAMHang:kbl */ 4495 if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) 4496 dc_flush_wa = true; 4497 } 4498 4499 len = 6; 4500 4501 if (vf_flush_wa) 4502 len += 6; 4503 4504 if (dc_flush_wa) 4505 len += 12; 4506 4507 cs = intel_ring_begin(request, len); 4508 if (IS_ERR(cs)) 4509 return PTR_ERR(cs); 4510 4511 if (vf_flush_wa) 4512 cs = gen8_emit_pipe_control(cs, 0, 0); 4513 4514 if (dc_flush_wa) 4515 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4516 0); 4517 4518 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4519 4520 if (dc_flush_wa) 4521 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4522 4523 intel_ring_advance(request, cs); 4524 4525 return 0; 4526 } 4527 4528 static int gen11_emit_flush_render(struct i915_request *request, 4529 u32 mode) 4530 { 4531 if (mode & EMIT_FLUSH) { 4532 u32 *cs; 4533 u32 flags = 0; 4534 4535 flags |= PIPE_CONTROL_CS_STALL; 4536 4537 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4538 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4539 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4540 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4541 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4542 flags |= PIPE_CONTROL_QW_WRITE; 4543 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4544 4545 cs = intel_ring_begin(request, 6); 4546 if (IS_ERR(cs)) 4547 return PTR_ERR(cs); 4548 4549 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4550 intel_ring_advance(request, cs); 4551 } 4552 4553 if (mode & EMIT_INVALIDATE) { 4554 u32 *cs; 4555 u32 flags = 0; 4556 4557 flags |= PIPE_CONTROL_CS_STALL; 4558 4559 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4560 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4561 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4562 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4563 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4564 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4565 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4566 flags |= PIPE_CONTROL_QW_WRITE; 4567 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4568 4569 cs = intel_ring_begin(request, 6); 4570 if (IS_ERR(cs)) 4571 return PTR_ERR(cs); 4572 4573 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4574 intel_ring_advance(request, cs); 4575 } 4576 4577 return 0; 4578 } 4579 4580 static u32 preparser_disable(bool state) 4581 { 4582 return MI_ARB_CHECK | 1 << 8 | state; 4583 } 4584 4585 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4586 { 4587 static const i915_reg_t vd[] = { 4588 GEN12_VD0_AUX_NV, 4589 GEN12_VD1_AUX_NV, 4590 GEN12_VD2_AUX_NV, 4591 GEN12_VD3_AUX_NV, 4592 }; 4593 4594 static const i915_reg_t ve[] = { 4595 GEN12_VE0_AUX_NV, 4596 GEN12_VE1_AUX_NV, 4597 }; 4598 4599 if (engine->class == VIDEO_DECODE_CLASS) 4600 return vd[engine->instance]; 4601 4602 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4603 return ve[engine->instance]; 4604 4605 GEM_BUG_ON("unknown aux_inv_reg\n"); 4606 4607 return INVALID_MMIO_REG; 4608 } 4609 4610 static u32 * 4611 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4612 { 4613 *cs++ = MI_LOAD_REGISTER_IMM(1); 4614 *cs++ = i915_mmio_reg_offset(inv_reg); 4615 *cs++ = AUX_INV; 4616 *cs++ = MI_NOOP; 4617 4618 return cs; 4619 } 4620 4621 static int gen12_emit_flush_render(struct i915_request *request, 4622 u32 mode) 4623 { 4624 if (mode & EMIT_FLUSH) { 4625 u32 flags = 0; 4626 u32 *cs; 4627 4628 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4629 flags |= PIPE_CONTROL_FLUSH_L3; 4630 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4631 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4632 /* Wa_1409600907:tgl */ 4633 flags |= PIPE_CONTROL_DEPTH_STALL; 4634 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4635 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4636 4637 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4638 flags |= PIPE_CONTROL_QW_WRITE; 4639 4640 flags |= PIPE_CONTROL_CS_STALL; 4641 4642 cs = intel_ring_begin(request, 6); 4643 if (IS_ERR(cs)) 4644 return PTR_ERR(cs); 4645 4646 cs = gen12_emit_pipe_control(cs, 4647 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4648 flags, LRC_PPHWSP_SCRATCH_ADDR); 4649 intel_ring_advance(request, cs); 4650 } 4651 4652 if (mode & EMIT_INVALIDATE) { 4653 u32 flags = 0; 4654 u32 *cs; 4655 4656 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4657 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4658 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4659 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4660 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4661 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4662 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4663 4664 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4665 flags |= PIPE_CONTROL_QW_WRITE; 4666 4667 flags |= PIPE_CONTROL_CS_STALL; 4668 4669 cs = intel_ring_begin(request, 8 + 4); 4670 if (IS_ERR(cs)) 4671 return PTR_ERR(cs); 4672 4673 /* 4674 * Prevent the pre-parser from skipping past the TLB 4675 * invalidate and loading a stale page for the batch 4676 * buffer / request payload. 4677 */ 4678 *cs++ = preparser_disable(true); 4679 4680 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4681 4682 /* hsdes: 1809175790 */ 4683 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4684 4685 *cs++ = preparser_disable(false); 4686 intel_ring_advance(request, cs); 4687 } 4688 4689 return 0; 4690 } 4691 4692 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4693 { 4694 intel_engine_mask_t aux_inv = 0; 4695 u32 cmd, *cs; 4696 4697 if (mode & EMIT_INVALIDATE) 4698 aux_inv = request->engine->mask & ~BIT(BCS0); 4699 4700 cs = intel_ring_begin(request, 4701 4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0)); 4702 if (IS_ERR(cs)) 4703 return PTR_ERR(cs); 4704 4705 cmd = MI_FLUSH_DW + 1; 4706 4707 /* We always require a command barrier so that subsequent 4708 * commands, such as breadcrumb interrupts, are strictly ordered 4709 * wrt the contents of the write cache being flushed to memory 4710 * (and thus being coherent from the CPU). 4711 */ 4712 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4713 4714 if (mode & EMIT_INVALIDATE) { 4715 cmd |= MI_INVALIDATE_TLB; 4716 if (request->engine->class == VIDEO_DECODE_CLASS) 4717 cmd |= MI_INVALIDATE_BSD; 4718 } 4719 4720 *cs++ = cmd; 4721 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4722 *cs++ = 0; /* upper addr */ 4723 *cs++ = 0; /* value */ 4724 4725 if (aux_inv) { /* hsdes: 1809175790 */ 4726 struct intel_engine_cs *engine; 4727 unsigned int tmp; 4728 4729 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4730 for_each_engine_masked(engine, request->engine->gt, 4731 aux_inv, tmp) { 4732 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4733 *cs++ = AUX_INV; 4734 } 4735 *cs++ = MI_NOOP; 4736 } 4737 intel_ring_advance(request, cs); 4738 4739 return 0; 4740 } 4741 4742 /* 4743 * Reserve space for 2 NOOPs at the end of each request to be 4744 * used as a workaround for not being allowed to do lite 4745 * restore with HEAD==TAIL (WaIdleLiteRestore). 
4746 */ 4747 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4748 { 4749 /* Ensure there's always at least one preemption point per-request. */ 4750 *cs++ = MI_ARB_CHECK; 4751 *cs++ = MI_NOOP; 4752 request->wa_tail = intel_ring_offset(request, cs); 4753 4754 return cs; 4755 } 4756 4757 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4758 { 4759 *cs++ = MI_SEMAPHORE_WAIT | 4760 MI_SEMAPHORE_GLOBAL_GTT | 4761 MI_SEMAPHORE_POLL | 4762 MI_SEMAPHORE_SAD_EQ_SDD; 4763 *cs++ = 0; 4764 *cs++ = intel_hws_preempt_address(request->engine); 4765 *cs++ = 0; 4766 4767 return cs; 4768 } 4769 4770 static __always_inline u32* 4771 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4772 { 4773 *cs++ = MI_USER_INTERRUPT; 4774 4775 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4776 if (intel_engine_has_semaphores(request->engine)) 4777 cs = emit_preempt_busywait(request, cs); 4778 4779 request->tail = intel_ring_offset(request, cs); 4780 assert_ring_tail_valid(request->ring, request->tail); 4781 4782 return gen8_emit_wa_tail(request, cs); 4783 } 4784 4785 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) 4786 { 4787 u32 addr = i915_request_active_timeline(request)->hwsp_offset; 4788 4789 return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); 4790 } 4791 4792 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4793 { 4794 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4795 } 4796 4797 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4798 { 4799 cs = gen8_emit_pipe_control(cs, 4800 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4801 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4802 PIPE_CONTROL_DC_FLUSH_ENABLE, 4803 0); 4804 4805 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4806 cs = gen8_emit_ggtt_write_rcs(cs, 4807 request->fence.seqno, 4808 i915_request_active_timeline(request)->hwsp_offset, 4809 PIPE_CONTROL_FLUSH_ENABLE | 4810 PIPE_CONTROL_CS_STALL); 4811 4812 return gen8_emit_fini_breadcrumb_tail(request, cs); 4813 } 4814 4815 static u32 * 4816 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4817 { 4818 cs = gen8_emit_ggtt_write_rcs(cs, 4819 request->fence.seqno, 4820 i915_request_active_timeline(request)->hwsp_offset, 4821 PIPE_CONTROL_CS_STALL | 4822 PIPE_CONTROL_TILE_CACHE_FLUSH | 4823 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4824 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4825 PIPE_CONTROL_DC_FLUSH_ENABLE | 4826 PIPE_CONTROL_FLUSH_ENABLE); 4827 4828 return gen8_emit_fini_breadcrumb_tail(request, cs); 4829 } 4830 4831 /* 4832 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4833 * flush and will continue pre-fetching the instructions after it before the 4834 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4835 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4836 * of the next request before the memory has been flushed, we're guaranteed that 4837 * we won't access the batch itself too early. 4838 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4839 * so, if the current request is modifying an instruction in the next request on 4840 * the same intel_context, we might pre-fetch and then execute the pre-update 4841 * instruction. 
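 * As a concrete (hypothetical) sequence of events illustrating the hazard:
 *
 *   1. request N emits a store that patches a dword inside request N+1's
 *      batch on the same context/ring;
 *   2. the pre-parser, which does not stall on N's breadcrumb flush, has
 *      already fetched that dword of N+1 before the store lands;
 *   3. the stale, pre-update instruction is what ends up executing.
 *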
To avoid this, the users of self-modifying code should either 4842 * disable the parser around the code emitting the memory writes, via a new flag 4843 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4844 * the in-kernel use-cases we've opted to use a separate context, see 4845 * reloc_gpu() as an example. 4846 * All the above applies only to the instructions themselves. Non-inline data 4847 * used by the instructions is not pre-fetched. 4848 */ 4849 4850 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4851 { 4852 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4853 MI_SEMAPHORE_GLOBAL_GTT | 4854 MI_SEMAPHORE_POLL | 4855 MI_SEMAPHORE_SAD_EQ_SDD; 4856 *cs++ = 0; 4857 *cs++ = intel_hws_preempt_address(request->engine); 4858 *cs++ = 0; 4859 *cs++ = 0; 4860 *cs++ = MI_NOOP; 4861 4862 return cs; 4863 } 4864 4865 static __always_inline u32* 4866 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4867 { 4868 *cs++ = MI_USER_INTERRUPT; 4869 4870 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4871 if (intel_engine_has_semaphores(request->engine)) 4872 cs = gen12_emit_preempt_busywait(request, cs); 4873 4874 request->tail = intel_ring_offset(request, cs); 4875 assert_ring_tail_valid(request->ring, request->tail); 4876 4877 return gen8_emit_wa_tail(request, cs); 4878 } 4879 4880 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4881 { 4882 return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4883 } 4884 4885 static u32 * 4886 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4887 { 4888 cs = gen12_emit_ggtt_write_rcs(cs, 4889 request->fence.seqno, 4890 i915_request_active_timeline(request)->hwsp_offset, 4891 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4892 PIPE_CONTROL_CS_STALL | 4893 PIPE_CONTROL_TILE_CACHE_FLUSH | 4894 PIPE_CONTROL_FLUSH_L3 | 4895 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4896 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4897 /* Wa_1409600907:tgl */ 4898 PIPE_CONTROL_DEPTH_STALL | 4899 PIPE_CONTROL_DC_FLUSH_ENABLE | 4900 PIPE_CONTROL_FLUSH_ENABLE); 4901 4902 return gen12_emit_fini_breadcrumb_tail(request, cs); 4903 } 4904 4905 static void execlists_park(struct intel_engine_cs *engine) 4906 { 4907 cancel_timer(&engine->execlists.timer); 4908 cancel_timer(&engine->execlists.preempt); 4909 } 4910 4911 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4912 { 4913 engine->submit_request = execlists_submit_request; 4914 engine->schedule = i915_schedule; 4915 engine->execlists.tasklet.func = execlists_submission_tasklet; 4916 4917 engine->reset.prepare = execlists_reset_prepare; 4918 engine->reset.rewind = execlists_reset_rewind; 4919 engine->reset.cancel = execlists_reset_cancel; 4920 engine->reset.finish = execlists_reset_finish; 4921 4922 engine->park = execlists_park; 4923 engine->unpark = NULL; 4924 4925 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 4926 if (!intel_vgpu_active(engine->i915)) { 4927 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 4928 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 4929 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 4930 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 4931 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 4932 } 4933 } 4934 4935 if (INTEL_GEN(engine->i915) >= 12) 4936 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 4937 4938 if (intel_engine_has_preemption(engine)) 4939 engine->emit_bb_start = gen8_emit_bb_start; 4940 else 4941 engine->emit_bb_start = gen8_emit_bb_start_noarb; 4942 } 4943 4944 static void 
execlists_shutdown(struct intel_engine_cs *engine)
{
	/* Synchronise with residual timers and any softirq they raise */
	del_timer_sync(&engine->execlists.timer);
	del_timer_sync(&engine->execlists.preempt);
	tasklet_kill(&engine->execlists.tasklet);
}

static void execlists_release(struct intel_engine_cs *engine)
{
	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */

	execlists_shutdown(engine);

	intel_engine_cleanup_common(engine);
	lrc_destroy_wa_ctx(engine);
}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */

	engine->resume = execlists_resume;

	engine->cops = &execlists_context_ops;
	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
	if (INTEL_GEN(engine->i915) >= 12) {
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
		engine->emit_flush = gen12_emit_flush;
	}
	engine->set_default_submission = intel_execlists_set_default_submission;

	if (INTEL_GEN(engine->i915) < 11) {
		engine->irq_enable = gen8_logical_ring_enable_irq;
		engine->irq_disable = gen8_logical_ring_disable_irq;
	} else {
		/*
		 * TODO: On Gen11 the interrupt masks need to be clear to
		 * allow C6 entry. Keep interrupts enabled at all times and
		 * take the hit of generating extra interrupts until a more
		 * refined solution exists.
		 */
	}
}

static inline void
logical_ring_default_irqs(struct intel_engine_cs *engine)
{
	unsigned int shift = 0;

	if (INTEL_GEN(engine->i915) < 11) {
		const u8 irq_shifts[] = {
			[RCS0] = GEN8_RCS_IRQ_SHIFT,
			[BCS0] = GEN8_BCS_IRQ_SHIFT,
			[VCS0] = GEN8_VCS0_IRQ_SHIFT,
			[VCS1] = GEN8_VCS1_IRQ_SHIFT,
			[VECS0] = GEN8_VECS_IRQ_SHIFT,
		};

		shift = irq_shifts[engine->id];
	}

	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
}

static void rcs_submission_override(struct intel_engine_cs *engine)
{
	switch (INTEL_GEN(engine->i915)) {
	case 12:
		engine->emit_flush = gen12_emit_flush_render;
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
		break;
	case 11:
		engine->emit_flush = gen11_emit_flush_render;
		engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
		break;
	default:
		engine->emit_flush = gen8_emit_flush_render;
		engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
		break;
	}
}

int intel_execlists_submission_setup(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists * const execlists = &engine->execlists;
	struct drm_i915_private *i915 = engine->i915;
	struct intel_uncore *uncore = engine->uncore;
	u32 base = engine->mmio_base;

	tasklet_init(&engine->execlists.tasklet,
		     execlists_submission_tasklet, (unsigned long)engine);
	timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
	timer_setup(&engine->execlists.preempt, execlists_preempt, 0);

logical_ring_default_vfuncs(engine); 5048 logical_ring_default_irqs(engine); 5049 5050 if (engine->class == RENDER_CLASS) 5051 rcs_submission_override(engine); 5052 5053 if (intel_init_workaround_bb(engine)) 5054 /* 5055 * We continue even if we fail to initialize WA batch 5056 * because we only expect rare glitches but nothing 5057 * critical to prevent us from using GPU 5058 */ 5059 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5060 5061 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5062 execlists->submit_reg = uncore->regs + 5063 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5064 execlists->ctrl_reg = uncore->regs + 5065 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5066 } else { 5067 execlists->submit_reg = uncore->regs + 5068 i915_mmio_reg_offset(RING_ELSP(base)); 5069 } 5070 5071 execlists->csb_status = 5072 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5073 5074 execlists->csb_write = 5075 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5076 5077 if (INTEL_GEN(i915) < 11) 5078 execlists->csb_size = GEN8_CSB_ENTRIES; 5079 else 5080 execlists->csb_size = GEN11_CSB_ENTRIES; 5081 5082 if (INTEL_GEN(engine->i915) >= 11) { 5083 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5084 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5085 } 5086 5087 /* Finally, take ownership and responsibility for cleanup! */ 5088 engine->sanitize = execlists_sanitize; 5089 engine->release = execlists_release; 5090 5091 return 0; 5092 } 5093 5094 static void init_common_reg_state(u32 * const regs, 5095 const struct intel_engine_cs *engine, 5096 const struct intel_ring *ring, 5097 bool inhibit) 5098 { 5099 u32 ctl; 5100 5101 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5102 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5103 if (inhibit) 5104 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5105 if (INTEL_GEN(engine->i915) < 11) 5106 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5107 CTX_CTRL_RS_CTX_ENABLE); 5108 regs[CTX_CONTEXT_CONTROL] = ctl; 5109 5110 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5111 regs[CTX_TIMESTAMP] = 0; 5112 } 5113 5114 static void init_wa_bb_reg_state(u32 * const regs, 5115 const struct intel_engine_cs *engine) 5116 { 5117 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5118 5119 if (wa_ctx->per_ctx.size) { 5120 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5121 5122 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5123 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5124 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5125 } 5126 5127 if (wa_ctx->indirect_ctx.size) { 5128 lrc_ring_setup_indirect_ctx(regs, engine, 5129 i915_ggtt_offset(wa_ctx->vma) + 5130 wa_ctx->indirect_ctx.offset, 5131 wa_ctx->indirect_ctx.size); 5132 } 5133 } 5134 5135 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5136 { 5137 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5138 /* 64b PPGTT (48bit canonical) 5139 * PDP0_DESCRIPTOR contains the base address to PML4 and 5140 * other PDP Descriptors are ignored. 
5141 */ 5142 ASSIGN_CTX_PML4(ppgtt, regs); 5143 } else { 5144 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5145 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5146 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5147 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5148 } 5149 } 5150 5151 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5152 { 5153 if (i915_is_ggtt(vm)) 5154 return i915_vm_to_ggtt(vm)->alias; 5155 else 5156 return i915_vm_to_ppgtt(vm); 5157 } 5158 5159 static void execlists_init_reg_state(u32 *regs, 5160 const struct intel_context *ce, 5161 const struct intel_engine_cs *engine, 5162 const struct intel_ring *ring, 5163 bool inhibit) 5164 { 5165 /* 5166 * A context is actually a big batch buffer with several 5167 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5168 * values we are setting here are only for the first context restore: 5169 * on a subsequent save, the GPU will recreate this batchbuffer with new 5170 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5171 * we are not initializing here). 5172 * 5173 * Must keep consistent with virtual_update_register_offsets(). 5174 */ 5175 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5176 5177 init_common_reg_state(regs, engine, ring, inhibit); 5178 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5179 5180 init_wa_bb_reg_state(regs, engine); 5181 5182 __reset_stop_ring(regs, engine); 5183 } 5184 5185 static int 5186 populate_lr_context(struct intel_context *ce, 5187 struct drm_i915_gem_object *ctx_obj, 5188 struct intel_engine_cs *engine, 5189 struct intel_ring *ring) 5190 { 5191 bool inhibit = true; 5192 void *vaddr; 5193 5194 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5195 if (IS_ERR(vaddr)) { 5196 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5197 return PTR_ERR(vaddr); 5198 } 5199 5200 set_redzone(vaddr, engine); 5201 5202 if (engine->default_state) { 5203 shmem_read(engine->default_state, 0, 5204 vaddr, engine->context_size); 5205 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5206 inhibit = false; 5207 } 5208 5209 /* Clear the ppHWSP (inc. per-context counters) */ 5210 memset(vaddr, 0, PAGE_SIZE); 5211 5212 /* 5213 * The second page of the context object contains some registers which 5214 * must be set up prior to the first execution. 
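	 *
	 * (Layout reminder: page 0 of the context image is the per-process
	 * HWSP that was just cleared above; the register state the GPU loads
	 * on a context restore lives at LRC_STATE_OFFSET, on the page after
	 * the ppHWSP, which is where vaddr + LRC_STATE_OFFSET below points.)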
5215 */ 5216 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5217 ce, engine, ring, inhibit); 5218 5219 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5220 i915_gem_object_unpin_map(ctx_obj); 5221 return 0; 5222 } 5223 5224 static int __execlists_context_alloc(struct intel_context *ce, 5225 struct intel_engine_cs *engine) 5226 { 5227 struct drm_i915_gem_object *ctx_obj; 5228 struct intel_ring *ring; 5229 struct i915_vma *vma; 5230 u32 context_size; 5231 int ret; 5232 5233 GEM_BUG_ON(ce->state); 5234 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5235 5236 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5237 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5238 5239 if (INTEL_GEN(engine->i915) == 12) { 5240 ce->wa_bb_page = context_size / PAGE_SIZE; 5241 context_size += PAGE_SIZE; 5242 } 5243 5244 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5245 if (IS_ERR(ctx_obj)) 5246 return PTR_ERR(ctx_obj); 5247 5248 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5249 if (IS_ERR(vma)) { 5250 ret = PTR_ERR(vma); 5251 goto error_deref_obj; 5252 } 5253 5254 if (!ce->timeline) { 5255 struct intel_timeline *tl; 5256 struct i915_vma *hwsp; 5257 5258 /* 5259 * Use the static global HWSP for the kernel context, and 5260 * a dynamically allocated cacheline for everyone else. 5261 */ 5262 hwsp = NULL; 5263 if (unlikely(intel_context_is_barrier(ce))) 5264 hwsp = engine->status_page.vma; 5265 5266 tl = intel_timeline_create(engine->gt, hwsp); 5267 if (IS_ERR(tl)) { 5268 ret = PTR_ERR(tl); 5269 goto error_deref_obj; 5270 } 5271 5272 ce->timeline = tl; 5273 } 5274 5275 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5276 if (IS_ERR(ring)) { 5277 ret = PTR_ERR(ring); 5278 goto error_deref_obj; 5279 } 5280 5281 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5282 if (ret) { 5283 drm_dbg(&engine->i915->drm, 5284 "Failed to populate LRC: %d\n", ret); 5285 goto error_ring_free; 5286 } 5287 5288 ce->ring = ring; 5289 ce->state = vma; 5290 5291 return 0; 5292 5293 error_ring_free: 5294 intel_ring_put(ring); 5295 error_deref_obj: 5296 i915_gem_object_put(ctx_obj); 5297 return ret; 5298 } 5299 5300 static struct list_head *virtual_queue(struct virtual_engine *ve) 5301 { 5302 return &ve->base.execlists.default_priolist.requests[0]; 5303 } 5304 5305 static void virtual_context_destroy(struct kref *kref) 5306 { 5307 struct virtual_engine *ve = 5308 container_of(kref, typeof(*ve), context.ref); 5309 unsigned int n; 5310 5311 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5312 GEM_BUG_ON(ve->request); 5313 GEM_BUG_ON(ve->context.inflight); 5314 5315 for (n = 0; n < ve->num_siblings; n++) { 5316 struct intel_engine_cs *sibling = ve->siblings[n]; 5317 struct rb_node *node = &ve->nodes[sibling->id].rb; 5318 unsigned long flags; 5319 5320 if (RB_EMPTY_NODE(node)) 5321 continue; 5322 5323 spin_lock_irqsave(&sibling->active.lock, flags); 5324 5325 /* Detachment is lazily performed in the execlists tasklet */ 5326 if (!RB_EMPTY_NODE(node)) 5327 rb_erase_cached(node, &sibling->execlists.virtual); 5328 5329 spin_unlock_irqrestore(&sibling->active.lock, flags); 5330 } 5331 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5332 5333 if (ve->context.state) 5334 __execlists_context_fini(&ve->context); 5335 intel_context_fini(&ve->context); 5336 5337 intel_engine_free_request_pool(&ve->base); 5338 5339 kfree(ve->bonds); 5340 kfree(ve); 5341 } 5342 5343 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5344 { 5345 
int swp; 5346 5347 /* 5348 * Pick a random sibling on starting to help spread the load around. 5349 * 5350 * New contexts are typically created with exactly the same order 5351 * of siblings, and often started in batches. Due to the way we iterate 5352 * the array of sibling when submitting requests, sibling[0] is 5353 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5354 * randomised across the system, we also help spread the load by the 5355 * first engine we inspect being different each time. 5356 * 5357 * NB This does not force us to execute on this engine, it will just 5358 * typically be the first we inspect for submission. 5359 */ 5360 swp = prandom_u32_max(ve->num_siblings); 5361 if (!swp) 5362 return; 5363 5364 swap(ve->siblings[swp], ve->siblings[0]); 5365 if (!intel_engine_has_relative_mmio(ve->siblings[0])) 5366 virtual_update_register_offsets(ve->context.lrc_reg_state, 5367 ve->siblings[0]); 5368 } 5369 5370 static int virtual_context_alloc(struct intel_context *ce) 5371 { 5372 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5373 5374 return __execlists_context_alloc(ce, ve->siblings[0]); 5375 } 5376 5377 static int virtual_context_pin(struct intel_context *ce) 5378 { 5379 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5380 int err; 5381 5382 /* Note: we must use a real engine class for setting up reg state */ 5383 err = __execlists_context_pin(ce, ve->siblings[0]); 5384 if (err) 5385 return err; 5386 5387 virtual_engine_initial_hint(ve); 5388 return 0; 5389 } 5390 5391 static void virtual_context_enter(struct intel_context *ce) 5392 { 5393 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5394 unsigned int n; 5395 5396 for (n = 0; n < ve->num_siblings; n++) 5397 intel_engine_pm_get(ve->siblings[n]); 5398 5399 intel_timeline_enter(ce->timeline); 5400 } 5401 5402 static void virtual_context_exit(struct intel_context *ce) 5403 { 5404 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5405 unsigned int n; 5406 5407 intel_timeline_exit(ce->timeline); 5408 5409 for (n = 0; n < ve->num_siblings; n++) 5410 intel_engine_pm_put(ve->siblings[n]); 5411 } 5412 5413 static const struct intel_context_ops virtual_context_ops = { 5414 .alloc = virtual_context_alloc, 5415 5416 .pin = virtual_context_pin, 5417 .unpin = execlists_context_unpin, 5418 5419 .enter = virtual_context_enter, 5420 .exit = virtual_context_exit, 5421 5422 .destroy = virtual_context_destroy, 5423 }; 5424 5425 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5426 { 5427 struct i915_request *rq; 5428 intel_engine_mask_t mask; 5429 5430 rq = READ_ONCE(ve->request); 5431 if (!rq) 5432 return 0; 5433 5434 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 5435 mask = rq->execution_mask; 5436 if (unlikely(!mask)) { 5437 /* Invalid selection, submit to a random engine in error */ 5438 i915_request_set_error_once(rq, -ENODEV); 5439 mask = ve->siblings[0]->mask; 5440 } 5441 5442 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5443 rq->fence.context, rq->fence.seqno, 5444 mask, ve->base.execlists.queue_priority_hint); 5445 5446 return mask; 5447 } 5448 5449 static void virtual_submission_tasklet(unsigned long data) 5450 { 5451 struct virtual_engine * const ve = (struct virtual_engine *)data; 5452 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5453 intel_engine_mask_t mask; 5454 unsigned int n; 5455 5456 rcu_read_lock(); 5457 mask = virtual_submission_mask(ve); 5458 rcu_read_unlock(); 5459 if (unlikely(!mask)) 5460 return; 5461 5462 local_irq_disable(); 5463 for (n = 0; n < ve->num_siblings; n++) { 5464 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5465 struct ve_node * const node = &ve->nodes[sibling->id]; 5466 struct rb_node **parent, *rb; 5467 bool first; 5468 5469 if (!READ_ONCE(ve->request)) 5470 break; /* already handled by a sibling's tasklet */ 5471 5472 if (unlikely(!(mask & sibling->mask))) { 5473 if (!RB_EMPTY_NODE(&node->rb)) { 5474 spin_lock(&sibling->active.lock); 5475 rb_erase_cached(&node->rb, 5476 &sibling->execlists.virtual); 5477 RB_CLEAR_NODE(&node->rb); 5478 spin_unlock(&sibling->active.lock); 5479 } 5480 continue; 5481 } 5482 5483 spin_lock(&sibling->active.lock); 5484 5485 if (!RB_EMPTY_NODE(&node->rb)) { 5486 /* 5487 * Cheat and avoid rebalancing the tree if we can 5488 * reuse this node in situ. 5489 */ 5490 first = rb_first_cached(&sibling->execlists.virtual) == 5491 &node->rb; 5492 if (prio == node->prio || (prio > node->prio && first)) 5493 goto submit_engine; 5494 5495 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5496 } 5497 5498 rb = NULL; 5499 first = true; 5500 parent = &sibling->execlists.virtual.rb_root.rb_node; 5501 while (*parent) { 5502 struct ve_node *other; 5503 5504 rb = *parent; 5505 other = rb_entry(rb, typeof(*other), rb); 5506 if (prio > other->prio) { 5507 parent = &rb->rb_left; 5508 } else { 5509 parent = &rb->rb_right; 5510 first = false; 5511 } 5512 } 5513 5514 rb_link_node(&node->rb, rb, parent); 5515 rb_insert_color_cached(&node->rb, 5516 &sibling->execlists.virtual, 5517 first); 5518 5519 submit_engine: 5520 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5521 node->prio = prio; 5522 if (first && prio > sibling->execlists.queue_priority_hint) 5523 tasklet_hi_schedule(&sibling->execlists.tasklet); 5524 5525 spin_unlock(&sibling->active.lock); 5526 } 5527 local_irq_enable(); 5528 } 5529 5530 static void virtual_submit_request(struct i915_request *rq) 5531 { 5532 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5533 struct i915_request *old; 5534 unsigned long flags; 5535 5536 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5537 rq->fence.context, 5538 rq->fence.seqno); 5539 5540 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5541 5542 spin_lock_irqsave(&ve->base.active.lock, flags); 5543 5544 old = ve->request; 5545 if (old) { /* background completion event from preempt-to-busy */ 5546 GEM_BUG_ON(!i915_request_completed(old)); 5547 __i915_request_submit(old); 5548 i915_request_put(old); 5549 } 5550 5551 if (i915_request_completed(rq)) { 5552 __i915_request_submit(rq); 5553 5554 ve->base.execlists.queue_priority_hint = INT_MIN; 5555 ve->request = NULL; 5556 } else { 5557 ve->base.execlists.queue_priority_hint = rq_prio(rq); 
5558 ve->request = i915_request_get(rq); 5559 5560 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5561 list_move_tail(&rq->sched.link, virtual_queue(ve)); 5562 5563 tasklet_schedule(&ve->base.execlists.tasklet); 5564 } 5565 5566 spin_unlock_irqrestore(&ve->base.active.lock, flags); 5567 } 5568 5569 static struct ve_bond * 5570 virtual_find_bond(struct virtual_engine *ve, 5571 const struct intel_engine_cs *master) 5572 { 5573 int i; 5574 5575 for (i = 0; i < ve->num_bonds; i++) { 5576 if (ve->bonds[i].master == master) 5577 return &ve->bonds[i]; 5578 } 5579 5580 return NULL; 5581 } 5582 5583 static void 5584 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 5585 { 5586 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5587 intel_engine_mask_t allowed, exec; 5588 struct ve_bond *bond; 5589 5590 allowed = ~to_request(signal)->engine->mask; 5591 5592 bond = virtual_find_bond(ve, to_request(signal)->engine); 5593 if (bond) 5594 allowed &= bond->sibling_mask; 5595 5596 /* Restrict the bonded request to run on only the available engines */ 5597 exec = READ_ONCE(rq->execution_mask); 5598 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 5599 ; 5600 5601 /* Prevent the master from being re-run on the bonded engines */ 5602 to_request(signal)->execution_mask &= ~allowed; 5603 } 5604 5605 struct intel_context * 5606 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 5607 unsigned int count) 5608 { 5609 struct virtual_engine *ve; 5610 unsigned int n; 5611 int err; 5612 5613 if (count == 0) 5614 return ERR_PTR(-EINVAL); 5615 5616 if (count == 1) 5617 return intel_context_create(siblings[0]); 5618 5619 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 5620 if (!ve) 5621 return ERR_PTR(-ENOMEM); 5622 5623 ve->base.i915 = siblings[0]->i915; 5624 ve->base.gt = siblings[0]->gt; 5625 ve->base.uncore = siblings[0]->uncore; 5626 ve->base.id = -1; 5627 5628 ve->base.class = OTHER_CLASS; 5629 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 5630 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5631 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5632 5633 /* 5634 * The decision on whether to submit a request using semaphores 5635 * depends on the saturated state of the engine. We only compute 5636 * this during HW submission of the request, and we need for this 5637 * state to be globally applied to all requests being submitted 5638 * to this engine. Virtual engines encompass more than one physical 5639 * engine and so we cannot accurately tell in advance if one of those 5640 * engines is already saturated and so cannot afford to use a semaphore 5641 * and be pessimized in priority for doing so -- if we are the only 5642 * context using semaphores after all other clients have stopped, we 5643 * will be starved on the saturated system. Such a global switch for 5644 * semaphores is less than ideal, but alas is the current compromise. 
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_breadcrumbs(&ve->base);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
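		 * In practice this restricts a virtual engine to siblings of
		 * a single class (e.g. the video decode/encode engines).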
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

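	/* Requests already submitted to the HW are shown with an "E" prefix. */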
	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif
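
/*
 * Usage sketch (illustrative only, not compiled): a caller holding two
 * sibling engines of the same class -- "vcs0" and "vcs1" below are purely
 * placeholders -- could build a load-balancing virtual context and attach a
 * bond for a hypothetical master engine "rcs0" roughly as follows:
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *	int err;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	err = intel_virtual_engine_attach_bond(ce->engine, rcs0, vcs0);
 *	if (err)
 *		intel_context_put(ce);
 *
 * In the driver these entry points are reached through the
 * I915_CONTEXT_PARAM_ENGINES context ioctl (see i915_gem_context.c).
 */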