/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use it. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different
 * context, which will cause the hardware to continue executing the second
 * request and queue the new request (the GPU detects the condition of a context
 * getting preempted with the same context and optimizes the context switch flow
 * by not doing preemption, but just sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
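	 *
	 * A minimal sketch of how such a bond could be applied when choosing
	 * where a bonded request may run (illustrative only, not the actual
	 * submission path):
	 *
	 *	mask = ve->base.mask;
	 *	for (n = 0; n < ve->num_bonds; n++)
	 *		if (ve->bonds[n].master == master)
	 *			mask &= ve->bonds[n].sibling_mask;
	 *
	 * i.e. the request is limited to the intersection of the virtual
	 * engine's siblings and the bonded set for that master.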
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 244 { 245 if (INTEL_GEN(engine->i915) >= 12) 246 return 0x60; 247 else if (INTEL_GEN(engine->i915) >= 9) 248 return 0x54; 249 else if (engine->class == RENDER_CLASS) 250 return 0x58; 251 else 252 return -1; 253 } 254 255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 256 { 257 if (INTEL_GEN(engine->i915) >= 12) 258 return 0x74; 259 else if (INTEL_GEN(engine->i915) >= 9) 260 return 0x68; 261 else if (engine->class == RENDER_CLASS) 262 return 0xd8; 263 else 264 return -1; 265 } 266 267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 268 { 269 if (INTEL_GEN(engine->i915) >= 12) 270 return 0x12; 271 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 272 return 0x18; 273 else 274 return -1; 275 } 276 277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 278 { 279 int x; 280 281 x = lrc_ring_wa_bb_per_ctx(engine); 282 if (x < 0) 283 return x; 284 285 return x + 2; 286 } 287 288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 289 { 290 int x; 291 292 x = lrc_ring_indirect_ptr(engine); 293 if (x < 0) 294 return x; 295 296 return x + 2; 297 } 298 299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 300 { 301 if (engine->class != RENDER_CLASS) 302 return -1; 303 304 if (INTEL_GEN(engine->i915) >= 12) 305 return 0xb6; 306 else if (INTEL_GEN(engine->i915) >= 11) 307 return 0xaa; 308 else 309 return -1; 310 } 311 312 static u32 313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 314 { 315 switch (INTEL_GEN(engine->i915)) { 316 default: 317 MISSING_CASE(INTEL_GEN(engine->i915)); 318 fallthrough; 319 case 12: 320 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 321 case 11: 322 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 323 case 10: 324 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 325 case 9: 326 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 327 case 8: 328 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 329 } 330 } 331 332 static void 333 lrc_ring_setup_indirect_ctx(u32 *regs, 334 const struct intel_engine_cs *engine, 335 u32 ctx_bb_ggtt_addr, 336 u32 size) 337 { 338 GEM_BUG_ON(!size); 339 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 340 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 341 regs[lrc_ring_indirect_ptr(engine) + 1] = 342 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 343 344 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 345 regs[lrc_ring_indirect_offset(engine) + 1] = 346 
lrc_ring_indirect_offset_default(engine) << 6; 347 } 348 349 static u32 intel_context_get_runtime(const struct intel_context *ce) 350 { 351 /* 352 * We can use either ppHWSP[16] which is recorded before the context 353 * switch (and so excludes the cost of context switches) or use the 354 * value from the context image itself, which is saved/restored earlier 355 * and so includes the cost of the save. 356 */ 357 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 358 } 359 360 static void mark_eio(struct i915_request *rq) 361 { 362 if (i915_request_completed(rq)) 363 return; 364 365 GEM_BUG_ON(i915_request_signaled(rq)); 366 367 i915_request_set_error_once(rq, -EIO); 368 i915_request_mark_complete(rq); 369 } 370 371 static struct i915_request * 372 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 373 { 374 struct i915_request *active = rq; 375 376 rcu_read_lock(); 377 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 378 if (i915_request_completed(rq)) 379 break; 380 381 active = rq; 382 } 383 rcu_read_unlock(); 384 385 return active; 386 } 387 388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 389 { 390 return (i915_ggtt_offset(engine->status_page.vma) + 391 I915_GEM_HWS_PREEMPT_ADDR); 392 } 393 394 static inline void 395 ring_set_paused(const struct intel_engine_cs *engine, int state) 396 { 397 /* 398 * We inspect HWS_PREEMPT with a semaphore inside 399 * engine->emit_fini_breadcrumb. If the dword is true, 400 * the ring is paused as the semaphore will busywait 401 * until the dword is false. 402 */ 403 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 404 if (state) 405 wmb(); 406 } 407 408 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 409 { 410 return rb_entry(rb, struct i915_priolist, node); 411 } 412 413 static inline int rq_prio(const struct i915_request *rq) 414 { 415 return READ_ONCE(rq->sched.attr.priority); 416 } 417 418 static int effective_prio(const struct i915_request *rq) 419 { 420 int prio = rq_prio(rq); 421 422 /* 423 * If this request is special and must not be interrupted at any 424 * cost, so be it. Note we are only checking the most recent request 425 * in the context and so may be masking an earlier vip request. It 426 * is hoped that under the conditions where nopreempt is used, this 427 * will not matter (i.e. all requests to that context will be 428 * nopreempt for as long as desired). 429 */ 430 if (i915_request_has_nopreempt(rq)) 431 prio = I915_PRIORITY_UNPREEMPTABLE; 432 433 return prio; 434 } 435 436 static int queue_prio(const struct intel_engine_execlists *execlists) 437 { 438 struct i915_priolist *p; 439 struct rb_node *rb; 440 441 rb = rb_first_cached(&execlists->queue); 442 if (!rb) 443 return INT_MIN; 444 445 /* 446 * As the priolist[] are inverted, with the highest priority in [0], 447 * we have to flip the index value to become priority. 448 */ 449 p = to_priolist(rb); 450 if (!I915_USER_PRIORITY_SHIFT) 451 return p->priority; 452 453 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 454 } 455 456 static inline bool need_preempt(const struct intel_engine_cs *engine, 457 const struct i915_request *rq, 458 struct rb_node *rb) 459 { 460 int last_prio; 461 462 if (!intel_engine_has_semaphores(engine)) 463 return false; 464 465 /* 466 * Check if the current priority hint merits a preemption attempt. 
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
549 * 550 * This is what a descriptor looks like, from LSB to MSB:: 551 * 552 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 553 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 554 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 555 * bits 53-54: mbz, reserved for use by hardware 556 * bits 55-63: group ID, currently unused and set to 0 557 * 558 * Starting from Gen11, the upper dword of the descriptor has a new format: 559 * 560 * bits 32-36: reserved 561 * bits 37-47: SW context ID 562 * bits 48:53: engine instance 563 * bit 54: mbz, reserved for use by hardware 564 * bits 55-60: SW counter 565 * bits 61-63: engine class 566 * 567 * engine info, SW context ID and SW counter need to form a unique number 568 * (Context ID) per lrc. 569 */ 570 static u32 571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 572 { 573 u32 desc; 574 575 desc = INTEL_LEGACY_32B_CONTEXT; 576 if (i915_vm_is_4lvl(ce->vm)) 577 desc = INTEL_LEGACY_64B_CONTEXT; 578 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 579 580 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 581 if (IS_GEN(engine->i915, 8)) 582 desc |= GEN8_CTX_L3LLC_COHERENT; 583 584 return i915_ggtt_offset(ce->state) | desc; 585 } 586 587 static inline unsigned int dword_in_page(void *addr) 588 { 589 return offset_in_page(addr) / sizeof(u32); 590 } 591 592 static void set_offsets(u32 *regs, 593 const u8 *data, 594 const struct intel_engine_cs *engine, 595 bool clear) 596 #define NOP(x) (BIT(7) | (x)) 597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 598 #define POSTED BIT(0) 599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 600 #define REG16(x) \ 601 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 602 (((x) >> 2) & 0x7f) 603 #define END(total_state_size) 0, (total_state_size) 604 { 605 const u32 base = engine->mmio_base; 606 607 while (*data) { 608 u8 count, flags; 609 610 if (*data & BIT(7)) { /* skip */ 611 count = *data++ & ~BIT(7); 612 if (clear) 613 memset32(regs, MI_NOOP, count); 614 regs += count; 615 continue; 616 } 617 618 count = *data & 0x3f; 619 flags = *data >> 6; 620 data++; 621 622 *regs = MI_LOAD_REGISTER_IMM(count); 623 if (flags & POSTED) 624 *regs |= MI_LRI_FORCE_POSTED; 625 if (INTEL_GEN(engine->i915) >= 11) 626 *regs |= MI_LRI_LRM_CS_MMIO; 627 regs++; 628 629 GEM_BUG_ON(!count); 630 do { 631 u32 offset = 0; 632 u8 v; 633 634 do { 635 v = *data++; 636 offset <<= 7; 637 offset |= v & ~BIT(7); 638 } while (v & BIT(7)); 639 640 regs[0] = base + (offset << 2); 641 if (clear) 642 regs[1] = 0; 643 regs += 2; 644 } while (--count); 645 } 646 647 if (clear) { 648 u8 count = *++data; 649 650 /* Clear past the tail for HW access */ 651 GEM_BUG_ON(dword_in_page(regs) > count); 652 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 653 654 /* Close the batch; used mainly by live_lrc_layout() */ 655 *regs = MI_BATCH_BUFFER_END; 656 if (INTEL_GEN(engine->i915) >= 10) 657 *regs |= BIT(0); 658 } 659 } 660 661 static const u8 gen8_xcs_offsets[] = { 662 NOP(1), 663 LRI(11, 0), 664 REG16(0x244), 665 REG(0x034), 666 REG(0x030), 667 REG(0x038), 668 REG(0x03c), 669 REG(0x168), 670 REG(0x140), 671 REG(0x110), 672 REG(0x11c), 673 REG(0x114), 674 REG(0x118), 675 676 NOP(9), 677 LRI(9, 0), 678 REG16(0x3a8), 679 REG16(0x28c), 680 REG16(0x288), 681 REG16(0x284), 682 REG16(0x280), 683 REG16(0x27c), 684 REG16(0x278), 685 REG16(0x274), 686 REG16(0x270), 687 688 NOP(13), 689 LRI(2, 0), 690 REG16(0x200), 691 REG(0x028), 692 693 END(80) 
694 }; 695 696 static const u8 gen9_xcs_offsets[] = { 697 NOP(1), 698 LRI(14, POSTED), 699 REG16(0x244), 700 REG(0x034), 701 REG(0x030), 702 REG(0x038), 703 REG(0x03c), 704 REG(0x168), 705 REG(0x140), 706 REG(0x110), 707 REG(0x11c), 708 REG(0x114), 709 REG(0x118), 710 REG(0x1c0), 711 REG(0x1c4), 712 REG(0x1c8), 713 714 NOP(3), 715 LRI(9, POSTED), 716 REG16(0x3a8), 717 REG16(0x28c), 718 REG16(0x288), 719 REG16(0x284), 720 REG16(0x280), 721 REG16(0x27c), 722 REG16(0x278), 723 REG16(0x274), 724 REG16(0x270), 725 726 NOP(13), 727 LRI(1, POSTED), 728 REG16(0x200), 729 730 NOP(13), 731 LRI(44, POSTED), 732 REG(0x028), 733 REG(0x09c), 734 REG(0x0c0), 735 REG(0x178), 736 REG(0x17c), 737 REG16(0x358), 738 REG(0x170), 739 REG(0x150), 740 REG(0x154), 741 REG(0x158), 742 REG16(0x41c), 743 REG16(0x600), 744 REG16(0x604), 745 REG16(0x608), 746 REG16(0x60c), 747 REG16(0x610), 748 REG16(0x614), 749 REG16(0x618), 750 REG16(0x61c), 751 REG16(0x620), 752 REG16(0x624), 753 REG16(0x628), 754 REG16(0x62c), 755 REG16(0x630), 756 REG16(0x634), 757 REG16(0x638), 758 REG16(0x63c), 759 REG16(0x640), 760 REG16(0x644), 761 REG16(0x648), 762 REG16(0x64c), 763 REG16(0x650), 764 REG16(0x654), 765 REG16(0x658), 766 REG16(0x65c), 767 REG16(0x660), 768 REG16(0x664), 769 REG16(0x668), 770 REG16(0x66c), 771 REG16(0x670), 772 REG16(0x674), 773 REG16(0x678), 774 REG16(0x67c), 775 REG(0x068), 776 777 END(176) 778 }; 779 780 static const u8 gen12_xcs_offsets[] = { 781 NOP(1), 782 LRI(13, POSTED), 783 REG16(0x244), 784 REG(0x034), 785 REG(0x030), 786 REG(0x038), 787 REG(0x03c), 788 REG(0x168), 789 REG(0x140), 790 REG(0x110), 791 REG(0x1c0), 792 REG(0x1c4), 793 REG(0x1c8), 794 REG(0x180), 795 REG16(0x2b4), 796 797 NOP(5), 798 LRI(9, POSTED), 799 REG16(0x3a8), 800 REG16(0x28c), 801 REG16(0x288), 802 REG16(0x284), 803 REG16(0x280), 804 REG16(0x27c), 805 REG16(0x278), 806 REG16(0x274), 807 REG16(0x270), 808 809 END(80) 810 }; 811 812 static const u8 gen8_rcs_offsets[] = { 813 NOP(1), 814 LRI(14, POSTED), 815 REG16(0x244), 816 REG(0x034), 817 REG(0x030), 818 REG(0x038), 819 REG(0x03c), 820 REG(0x168), 821 REG(0x140), 822 REG(0x110), 823 REG(0x11c), 824 REG(0x114), 825 REG(0x118), 826 REG(0x1c0), 827 REG(0x1c4), 828 REG(0x1c8), 829 830 NOP(3), 831 LRI(9, POSTED), 832 REG16(0x3a8), 833 REG16(0x28c), 834 REG16(0x288), 835 REG16(0x284), 836 REG16(0x280), 837 REG16(0x27c), 838 REG16(0x278), 839 REG16(0x274), 840 REG16(0x270), 841 842 NOP(13), 843 LRI(1, 0), 844 REG(0x0c8), 845 846 END(80) 847 }; 848 849 static const u8 gen9_rcs_offsets[] = { 850 NOP(1), 851 LRI(14, POSTED), 852 REG16(0x244), 853 REG(0x34), 854 REG(0x30), 855 REG(0x38), 856 REG(0x3c), 857 REG(0x168), 858 REG(0x140), 859 REG(0x110), 860 REG(0x11c), 861 REG(0x114), 862 REG(0x118), 863 REG(0x1c0), 864 REG(0x1c4), 865 REG(0x1c8), 866 867 NOP(3), 868 LRI(9, POSTED), 869 REG16(0x3a8), 870 REG16(0x28c), 871 REG16(0x288), 872 REG16(0x284), 873 REG16(0x280), 874 REG16(0x27c), 875 REG16(0x278), 876 REG16(0x274), 877 REG16(0x270), 878 879 NOP(13), 880 LRI(1, 0), 881 REG(0xc8), 882 883 NOP(13), 884 LRI(44, POSTED), 885 REG(0x28), 886 REG(0x9c), 887 REG(0xc0), 888 REG(0x178), 889 REG(0x17c), 890 REG16(0x358), 891 REG(0x170), 892 REG(0x150), 893 REG(0x154), 894 REG(0x158), 895 REG16(0x41c), 896 REG16(0x600), 897 REG16(0x604), 898 REG16(0x608), 899 REG16(0x60c), 900 REG16(0x610), 901 REG16(0x614), 902 REG16(0x618), 903 REG16(0x61c), 904 REG16(0x620), 905 REG16(0x624), 906 REG16(0x628), 907 REG16(0x62c), 908 REG16(0x630), 909 REG16(0x634), 910 REG16(0x638), 911 REG16(0x63c), 912 
REG16(0x640), 913 REG16(0x644), 914 REG16(0x648), 915 REG16(0x64c), 916 REG16(0x650), 917 REG16(0x654), 918 REG16(0x658), 919 REG16(0x65c), 920 REG16(0x660), 921 REG16(0x664), 922 REG16(0x668), 923 REG16(0x66c), 924 REG16(0x670), 925 REG16(0x674), 926 REG16(0x678), 927 REG16(0x67c), 928 REG(0x68), 929 930 END(176) 931 }; 932 933 static const u8 gen11_rcs_offsets[] = { 934 NOP(1), 935 LRI(15, POSTED), 936 REG16(0x244), 937 REG(0x034), 938 REG(0x030), 939 REG(0x038), 940 REG(0x03c), 941 REG(0x168), 942 REG(0x140), 943 REG(0x110), 944 REG(0x11c), 945 REG(0x114), 946 REG(0x118), 947 REG(0x1c0), 948 REG(0x1c4), 949 REG(0x1c8), 950 REG(0x180), 951 952 NOP(1), 953 LRI(9, POSTED), 954 REG16(0x3a8), 955 REG16(0x28c), 956 REG16(0x288), 957 REG16(0x284), 958 REG16(0x280), 959 REG16(0x27c), 960 REG16(0x278), 961 REG16(0x274), 962 REG16(0x270), 963 964 LRI(1, POSTED), 965 REG(0x1b0), 966 967 NOP(10), 968 LRI(1, 0), 969 REG(0x0c8), 970 971 END(80) 972 }; 973 974 static const u8 gen12_rcs_offsets[] = { 975 NOP(1), 976 LRI(13, POSTED), 977 REG16(0x244), 978 REG(0x034), 979 REG(0x030), 980 REG(0x038), 981 REG(0x03c), 982 REG(0x168), 983 REG(0x140), 984 REG(0x110), 985 REG(0x1c0), 986 REG(0x1c4), 987 REG(0x1c8), 988 REG(0x180), 989 REG16(0x2b4), 990 991 NOP(5), 992 LRI(9, POSTED), 993 REG16(0x3a8), 994 REG16(0x28c), 995 REG16(0x288), 996 REG16(0x284), 997 REG16(0x280), 998 REG16(0x27c), 999 REG16(0x278), 1000 REG16(0x274), 1001 REG16(0x270), 1002 1003 LRI(3, POSTED), 1004 REG(0x1b0), 1005 REG16(0x5a8), 1006 REG16(0x5ac), 1007 1008 NOP(6), 1009 LRI(1, 0), 1010 REG(0x0c8), 1011 NOP(3 + 9 + 1), 1012 1013 LRI(51, POSTED), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG(0x028), 1021 REG(0x09c), 1022 REG(0x0c0), 1023 REG(0x178), 1024 REG(0x17c), 1025 REG16(0x358), 1026 REG(0x170), 1027 REG(0x150), 1028 REG(0x154), 1029 REG(0x158), 1030 REG16(0x41c), 1031 REG16(0x600), 1032 REG16(0x604), 1033 REG16(0x608), 1034 REG16(0x60c), 1035 REG16(0x610), 1036 REG16(0x614), 1037 REG16(0x618), 1038 REG16(0x61c), 1039 REG16(0x620), 1040 REG16(0x624), 1041 REG16(0x628), 1042 REG16(0x62c), 1043 REG16(0x630), 1044 REG16(0x634), 1045 REG16(0x638), 1046 REG16(0x63c), 1047 REG16(0x640), 1048 REG16(0x644), 1049 REG16(0x648), 1050 REG16(0x64c), 1051 REG16(0x650), 1052 REG16(0x654), 1053 REG16(0x658), 1054 REG16(0x65c), 1055 REG16(0x660), 1056 REG16(0x664), 1057 REG16(0x668), 1058 REG16(0x66c), 1059 REG16(0x670), 1060 REG16(0x674), 1061 REG16(0x678), 1062 REG16(0x67c), 1063 REG(0x068), 1064 REG(0x084), 1065 NOP(1), 1066 1067 END(192) 1068 }; 1069 1070 #undef END 1071 #undef REG16 1072 #undef REG 1073 #undef LRI 1074 #undef NOP 1075 1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1077 { 1078 /* 1079 * The gen12+ lists only have the registers we program in the basic 1080 * default state. We rely on the context image using relative 1081 * addressing to automatic fixup the register state between the 1082 * physical engines for virtual engine. 
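	 *
	 * As an illustration of the encoding consumed by set_offsets(), a
	 * table entry such as
	 *
	 *	LRI(1, POSTED), REG16(0x200),
	 *
	 * expands to an MI_LOAD_REGISTER_IMM(1) header with the force-posted
	 * bit set (plus MI_LRI_LRM_CS_MMIO on Gen11+), followed by the
	 * register address engine->mmio_base + 0x200, while NOP(n) simply
	 * skips (and, when clearing, fills with MI_NOOP) n dwords of the
	 * context image.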
1083 */ 1084 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1085 !intel_engine_has_relative_mmio(engine)); 1086 1087 if (engine->class == RENDER_CLASS) { 1088 if (INTEL_GEN(engine->i915) >= 12) 1089 return gen12_rcs_offsets; 1090 else if (INTEL_GEN(engine->i915) >= 11) 1091 return gen11_rcs_offsets; 1092 else if (INTEL_GEN(engine->i915) >= 9) 1093 return gen9_rcs_offsets; 1094 else 1095 return gen8_rcs_offsets; 1096 } else { 1097 if (INTEL_GEN(engine->i915) >= 12) 1098 return gen12_xcs_offsets; 1099 else if (INTEL_GEN(engine->i915) >= 9) 1100 return gen9_xcs_offsets; 1101 else 1102 return gen8_xcs_offsets; 1103 } 1104 } 1105 1106 static struct i915_request * 1107 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1108 { 1109 struct i915_request *rq, *rn, *active = NULL; 1110 struct list_head *pl; 1111 int prio = I915_PRIORITY_INVALID; 1112 1113 lockdep_assert_held(&engine->active.lock); 1114 1115 list_for_each_entry_safe_reverse(rq, rn, 1116 &engine->active.requests, 1117 sched.link) { 1118 if (i915_request_completed(rq)) 1119 continue; /* XXX */ 1120 1121 __i915_request_unsubmit(rq); 1122 1123 /* 1124 * Push the request back into the queue for later resubmission. 1125 * If this request is not native to this physical engine (i.e. 1126 * it came from a virtual source), push it back onto the virtual 1127 * engine so that it can be moved across onto another physical 1128 * engine as load dictates. 1129 */ 1130 if (likely(rq->execution_mask == engine->mask)) { 1131 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1132 if (rq_prio(rq) != prio) { 1133 prio = rq_prio(rq); 1134 pl = i915_sched_lookup_priolist(engine, prio); 1135 } 1136 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1137 1138 list_move(&rq->sched.link, pl); 1139 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1140 1141 /* Check in case we rollback so far we wrap [size/2] */ 1142 if (intel_ring_direction(rq->ring, 1143 rq->tail, 1144 rq->ring->tail + 8) > 0) 1145 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1146 1147 active = rq; 1148 } else { 1149 struct intel_engine_cs *owner = rq->context->engine; 1150 1151 WRITE_ONCE(rq->engine, owner); 1152 owner->submit_request(rq); 1153 active = NULL; 1154 } 1155 } 1156 1157 return active; 1158 } 1159 1160 struct i915_request * 1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1162 { 1163 struct intel_engine_cs *engine = 1164 container_of(execlists, typeof(*engine), execlists); 1165 1166 return __unwind_incomplete_requests(engine); 1167 } 1168 1169 static inline void 1170 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1171 { 1172 /* 1173 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1174 * The compiler should eliminate this function as dead-code. 
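	 *
	 * IS_ENABLED() evaluates to a compile-time constant, so with
	 * CONFIG_DRM_I915_GVT=n the early return below is unconditional and
	 * the notifier call after it is discarded by the compiler.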
1175 */ 1176 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1177 return; 1178 1179 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1180 status, rq); 1181 } 1182 1183 static void intel_engine_context_in(struct intel_engine_cs *engine) 1184 { 1185 unsigned long flags; 1186 1187 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1188 return; 1189 1190 write_seqlock_irqsave(&engine->stats.lock, flags); 1191 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1192 engine->stats.start = ktime_get(); 1193 atomic_inc(&engine->stats.active); 1194 } 1195 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1196 } 1197 1198 static void intel_engine_context_out(struct intel_engine_cs *engine) 1199 { 1200 unsigned long flags; 1201 1202 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1203 1204 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1205 return; 1206 1207 write_seqlock_irqsave(&engine->stats.lock, flags); 1208 if (atomic_dec_and_test(&engine->stats.active)) { 1209 engine->stats.total = 1210 ktime_add(engine->stats.total, 1211 ktime_sub(ktime_get(), engine->stats.start)); 1212 } 1213 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1214 } 1215 1216 static void 1217 execlists_check_context(const struct intel_context *ce, 1218 const struct intel_engine_cs *engine, 1219 const char *when) 1220 { 1221 const struct intel_ring *ring = ce->ring; 1222 u32 *regs = ce->lrc_reg_state; 1223 bool valid = true; 1224 int x; 1225 1226 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1227 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1228 engine->name, 1229 regs[CTX_RING_START], 1230 i915_ggtt_offset(ring->vma)); 1231 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1232 valid = false; 1233 } 1234 1235 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1236 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1237 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1238 engine->name, 1239 regs[CTX_RING_CTL], 1240 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1241 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1242 valid = false; 1243 } 1244 1245 x = lrc_ring_mi_mode(engine); 1246 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1247 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1248 engine->name, regs[x + 1]); 1249 regs[x + 1] &= ~STOP_RING; 1250 regs[x + 1] |= STOP_RING << 16; 1251 valid = false; 1252 } 1253 1254 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1255 } 1256 1257 static void restore_default_state(struct intel_context *ce, 1258 struct intel_engine_cs *engine) 1259 { 1260 u32 *regs; 1261 1262 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1263 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1264 1265 ce->runtime.last = intel_context_get_runtime(ce); 1266 } 1267 1268 static void reset_active(struct i915_request *rq, 1269 struct intel_engine_cs *engine) 1270 { 1271 struct intel_context * const ce = rq->context; 1272 u32 head; 1273 1274 /* 1275 * The executing context has been cancelled. We want to prevent 1276 * further execution along this context and propagate the error on 1277 * to anything depending on its results. 1278 * 1279 * In __i915_request_submit(), we apply the -EIO and remove the 1280 * requests' payloads for any banned requests. 
But first, we must 1281 * rewind the context back to the start of the incomplete request so 1282 * that we do not jump back into the middle of the batch. 1283 * 1284 * We preserve the breadcrumbs and semaphores of the incomplete 1285 * requests so that inter-timeline dependencies (i.e other timelines) 1286 * remain correctly ordered. And we defer to __i915_request_submit() 1287 * so that all asynchronous waits are correctly handled. 1288 */ 1289 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1290 rq->fence.context, rq->fence.seqno); 1291 1292 /* On resubmission of the active request, payload will be scrubbed */ 1293 if (i915_request_completed(rq)) 1294 head = rq->tail; 1295 else 1296 head = active_request(ce->timeline, rq)->head; 1297 head = intel_ring_wrap(ce->ring, head); 1298 1299 /* Scrub the context image to prevent replaying the previous batch */ 1300 restore_default_state(ce, engine); 1301 __execlists_update_reg_state(ce, engine, head); 1302 1303 /* We've switched away, so this should be a no-op, but intent matters */ 1304 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1305 } 1306 1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1308 { 1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1310 ce->runtime.num_underflow += dt < 0; 1311 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1312 #endif 1313 } 1314 1315 static void intel_context_update_runtime(struct intel_context *ce) 1316 { 1317 u32 old; 1318 s32 dt; 1319 1320 if (intel_context_is_barrier(ce)) 1321 return; 1322 1323 old = ce->runtime.last; 1324 ce->runtime.last = intel_context_get_runtime(ce); 1325 dt = ce->runtime.last - old; 1326 1327 if (unlikely(dt <= 0)) { 1328 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1329 old, ce->runtime.last, dt); 1330 st_update_runtime_underflow(ce, dt); 1331 return; 1332 } 1333 1334 ewma_runtime_add(&ce->runtime.avg, dt); 1335 ce->runtime.total += dt; 1336 } 1337 1338 static inline struct intel_engine_cs * 1339 __execlists_schedule_in(struct i915_request *rq) 1340 { 1341 struct intel_engine_cs * const engine = rq->engine; 1342 struct intel_context * const ce = rq->context; 1343 1344 intel_context_get(ce); 1345 1346 if (unlikely(intel_context_is_banned(ce))) 1347 reset_active(rq, engine); 1348 1349 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1350 execlists_check_context(ce, engine, "before"); 1351 1352 if (ce->tag) { 1353 /* Use a fixed tag for OA and friends */ 1354 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1355 ce->lrc.ccid = ce->tag; 1356 } else { 1357 /* We don't need a strict matching tag, just different values */ 1358 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1359 1360 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1361 clear_bit(tag - 1, &engine->context_tag); 1362 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1363 1364 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1365 } 1366 1367 ce->lrc.ccid |= engine->execlists.ccid; 1368 1369 __intel_gt_pm_get(engine->gt); 1370 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1371 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1372 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1373 intel_engine_context_in(engine); 1374 1375 return engine; 1376 } 1377 1378 static inline struct i915_request * 1379 execlists_schedule_in(struct i915_request *rq, int idx) 1380 { 1381 struct intel_context * const ce = rq->context; 1382 struct intel_engine_cs *old; 1383 1384 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1385 
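	/*
	 * ce->inflight packs the owning engine pointer together with a small
	 * submission count in the low pointer bits (see the ptr_inc() below
	 * and ptr_unmask_bits() in execlists_schedule_out()): the first
	 * submission installs the engine via __execlists_schedule_in(), while
	 * back-to-back resubmissions of the same context only bump the packed
	 * count.
	 */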
trace_i915_request_in(rq, idx); 1386 1387 old = READ_ONCE(ce->inflight); 1388 do { 1389 if (!old) { 1390 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1391 break; 1392 } 1393 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1394 1395 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1396 return i915_request_get(rq); 1397 } 1398 1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1400 { 1401 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1402 struct i915_request *next = READ_ONCE(ve->request); 1403 1404 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1405 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1406 } 1407 1408 static inline void 1409 __execlists_schedule_out(struct i915_request *rq, 1410 struct intel_engine_cs * const engine, 1411 unsigned int ccid) 1412 { 1413 struct intel_context * const ce = rq->context; 1414 1415 /* 1416 * NB process_csb() is not under the engine->active.lock and hence 1417 * schedule_out can race with schedule_in meaning that we should 1418 * refrain from doing non-trivial work here. 1419 */ 1420 1421 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1422 execlists_check_context(ce, engine, "after"); 1423 1424 /* 1425 * If we have just completed this context, the engine may now be 1426 * idle and we want to re-enter powersaving. 1427 */ 1428 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1429 i915_request_completed(rq)) 1430 intel_engine_add_retire(engine, ce->timeline); 1431 1432 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1433 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1434 if (ccid < BITS_PER_LONG) { 1435 GEM_BUG_ON(ccid == 0); 1436 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1437 set_bit(ccid - 1, &engine->context_tag); 1438 } 1439 1440 intel_context_update_runtime(ce); 1441 intel_engine_context_out(engine); 1442 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1443 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1444 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1445 intel_gt_pm_put_async(engine->gt); 1446 1447 /* 1448 * If this is part of a virtual engine, its next request may 1449 * have been blocked waiting for access to the active context. 1450 * We have to kick all the siblings again in case we need to 1451 * switch (e.g. the next request is not runnable on this 1452 * engine). Hopefully, we will already have submitted the next 1453 * request before the tasklet runs and do not need to rebuild 1454 * each virtual tree and kick everyone again. 1455 */ 1456 if (ce->engine != engine) 1457 kick_siblings(rq, ce); 1458 1459 intel_context_put(ce); 1460 } 1461 1462 static inline void 1463 execlists_schedule_out(struct i915_request *rq) 1464 { 1465 struct intel_context * const ce = rq->context; 1466 struct intel_engine_cs *cur, *old; 1467 u32 ccid; 1468 1469 trace_i915_request_out(rq); 1470 1471 ccid = rq->context->lrc.ccid; 1472 old = READ_ONCE(ce->inflight); 1473 do 1474 cur = ptr_unmask_bits(old, 2) ? 
ptr_dec(old) : NULL; 1475 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1476 if (!cur) 1477 __execlists_schedule_out(rq, old, ccid); 1478 1479 i915_request_put(rq); 1480 } 1481 1482 static u64 execlists_update_context(struct i915_request *rq) 1483 { 1484 struct intel_context *ce = rq->context; 1485 u64 desc = ce->lrc.desc; 1486 u32 tail, prev; 1487 1488 /* 1489 * WaIdleLiteRestore:bdw,skl 1490 * 1491 * We should never submit the context with the same RING_TAIL twice 1492 * just in case we submit an empty ring, which confuses the HW. 1493 * 1494 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1495 * the normal request to be able to always advance the RING_TAIL on 1496 * subsequent resubmissions (for lite restore). Should that fail us, 1497 * and we try and submit the same tail again, force the context 1498 * reload. 1499 * 1500 * If we need to return to a preempted context, we need to skip the 1501 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1502 * HW has a tendency to ignore us rewinding the TAIL to the end of 1503 * an earlier request. 1504 */ 1505 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1506 prev = rq->ring->tail; 1507 tail = intel_ring_set_tail(rq->ring, rq->tail); 1508 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1509 desc |= CTX_DESC_FORCE_RESTORE; 1510 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1511 rq->tail = rq->wa_tail; 1512 1513 /* 1514 * Make sure the context image is complete before we submit it to HW. 1515 * 1516 * Ostensibly, writes (including the WCB) should be flushed prior to 1517 * an uncached write such as our mmio register access, the empirical 1518 * evidence (esp. on Braswell) suggests that the WC write into memory 1519 * may not be visible to the HW prior to the completion of the UC 1520 * register write and that we may begin execution from the context 1521 * before its image is complete leading to invalid PD chasing. 1522 */ 1523 wmb(); 1524 1525 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1526 return desc; 1527 } 1528 1529 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1530 { 1531 if (execlists->ctrl_reg) { 1532 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1533 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1534 } else { 1535 writel(upper_32_bits(desc), execlists->submit_reg); 1536 writel(lower_32_bits(desc), execlists->submit_reg); 1537 } 1538 } 1539 1540 static __maybe_unused char * 1541 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1542 { 1543 if (!rq) 1544 return ""; 1545 1546 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1547 prefix, 1548 rq->context->lrc.ccid, 1549 rq->fence.context, rq->fence.seqno, 1550 i915_request_completed(rq) ? "!" : 1551 i915_request_started(rq) ? 
"*" : 1552 "", 1553 rq_prio(rq)); 1554 1555 return buf; 1556 } 1557 1558 static __maybe_unused void 1559 trace_ports(const struct intel_engine_execlists *execlists, 1560 const char *msg, 1561 struct i915_request * const *ports) 1562 { 1563 const struct intel_engine_cs *engine = 1564 container_of(execlists, typeof(*engine), execlists); 1565 char __maybe_unused p0[40], p1[40]; 1566 1567 if (!ports[0]) 1568 return; 1569 1570 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1571 dump_port(p0, sizeof(p0), "", ports[0]), 1572 dump_port(p1, sizeof(p1), ", ", ports[1])); 1573 } 1574 1575 static inline bool 1576 reset_in_progress(const struct intel_engine_execlists *execlists) 1577 { 1578 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1579 } 1580 1581 static __maybe_unused bool 1582 assert_pending_valid(const struct intel_engine_execlists *execlists, 1583 const char *msg) 1584 { 1585 struct intel_engine_cs *engine = 1586 container_of(execlists, typeof(*engine), execlists); 1587 struct i915_request * const *port, *rq; 1588 struct intel_context *ce = NULL; 1589 bool sentinel = false; 1590 u32 ccid = -1; 1591 1592 trace_ports(execlists, msg, execlists->pending); 1593 1594 /* We may be messing around with the lists during reset, lalala */ 1595 if (reset_in_progress(execlists)) 1596 return true; 1597 1598 if (!execlists->pending[0]) { 1599 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1600 engine->name); 1601 return false; 1602 } 1603 1604 if (execlists->pending[execlists_num_ports(execlists)]) { 1605 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1606 engine->name, execlists_num_ports(execlists)); 1607 return false; 1608 } 1609 1610 for (port = execlists->pending; (rq = *port); port++) { 1611 unsigned long flags; 1612 bool ok = true; 1613 1614 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1615 GEM_BUG_ON(!i915_request_is_active(rq)); 1616 1617 if (ce == rq->context) { 1618 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1619 engine->name, 1620 ce->timeline->fence_context, 1621 port - execlists->pending); 1622 return false; 1623 } 1624 ce = rq->context; 1625 1626 if (ccid == ce->lrc.ccid) { 1627 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1628 engine->name, 1629 ccid, ce->timeline->fence_context, 1630 port - execlists->pending); 1631 return false; 1632 } 1633 ccid = ce->lrc.ccid; 1634 1635 /* 1636 * Sentinels are supposed to be the last request so they flush 1637 * the current execution off the HW. Check that they are the only 1638 * request in the pending submission. 1639 */ 1640 if (sentinel) { 1641 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1642 engine->name, 1643 ce->timeline->fence_context, 1644 port - execlists->pending); 1645 return false; 1646 } 1647 sentinel = i915_request_has_sentinel(rq); 1648 1649 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1650 if (!spin_trylock_irqsave(&rq->lock, flags)) 1651 continue; 1652 1653 if (i915_request_completed(rq)) 1654 goto unlock; 1655 1656 if (i915_active_is_idle(&ce->active) && 1657 !intel_context_is_barrier(ce)) { 1658 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1659 engine->name, 1660 ce->timeline->fence_context, 1661 port - execlists->pending); 1662 ok = false; 1663 goto unlock; 1664 } 1665 1666 if (!i915_vma_is_pinned(ce->state)) { 1667 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1668 engine->name, 1669 ce->timeline->fence_context, 1670 port - execlists->pending); 1671 ok = false; 1672 goto unlock; 1673 } 1674 1675 if (!i915_vma_is_pinned(ce->ring->vma)) { 1676 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1677 engine->name, 1678 ce->timeline->fence_context, 1679 port - execlists->pending); 1680 ok = false; 1681 goto unlock; 1682 } 1683 1684 unlock: 1685 spin_unlock_irqrestore(&rq->lock, flags); 1686 if (!ok) 1687 return false; 1688 } 1689 1690 return ce; 1691 } 1692 1693 static void execlists_submit_ports(struct intel_engine_cs *engine) 1694 { 1695 struct intel_engine_execlists *execlists = &engine->execlists; 1696 unsigned int n; 1697 1698 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1699 1700 /* 1701 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1702 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1703 * not be relinquished until the device is idle (see 1704 * i915_gem_idle_work_handler()). As a precaution, we make sure 1705 * that all ELSP are drained i.e. we have processed the CSB, 1706 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1707 */ 1708 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1709 1710 /* 1711 * ELSQ note: the submit queue is not cleared after being submitted 1712 * to the HW so we need to make sure we always clean it up. This is 1713 * currently ensured by the fact that we always write the same number 1714 * of elsq entries, keep this in mind before changing the loop below. 1715 */ 1716 for (n = execlists_num_ports(execlists); n--; ) { 1717 struct i915_request *rq = execlists->pending[n]; 1718 1719 write_desc(execlists, 1720 rq ? execlists_update_context(rq) : 0, 1721 n); 1722 } 1723 1724 /* we need to manually load the submit queue */ 1725 if (execlists->ctrl_reg) 1726 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1727 } 1728 1729 static bool ctx_single_port_submission(const struct intel_context *ce) 1730 { 1731 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1732 intel_context_force_single_submission(ce)); 1733 } 1734 1735 static bool can_merge_ctx(const struct intel_context *prev, 1736 const struct intel_context *next) 1737 { 1738 if (prev != next) 1739 return false; 1740 1741 if (ctx_single_port_submission(prev)) 1742 return false; 1743 1744 return true; 1745 } 1746 1747 static unsigned long i915_request_flags(const struct i915_request *rq) 1748 { 1749 return READ_ONCE(rq->fence.flags); 1750 } 1751 1752 static bool can_merge_rq(const struct i915_request *prev, 1753 const struct i915_request *next) 1754 { 1755 GEM_BUG_ON(prev == next); 1756 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1757 1758 /* 1759 * We do not submit known completed requests. Therefore if the next 1760 * request is already completed, we can pretend to merge it in 1761 * with the previous context (and we will skip updating the ELSP 1762 * and tracking). 
Thus hopefully keeping the ELSP full with active 1763 * contexts, despite the best efforts of preempt-to-busy to confuse 1764 * us. 1765 */ 1766 if (i915_request_completed(next)) 1767 return true; 1768 1769 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) & 1770 (BIT(I915_FENCE_FLAG_NOPREEMPT) | 1771 BIT(I915_FENCE_FLAG_SENTINEL)))) 1772 return false; 1773 1774 if (!can_merge_ctx(prev->context, next->context)) 1775 return false; 1776 1777 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno)); 1778 return true; 1779 } 1780 1781 static void virtual_update_register_offsets(u32 *regs, 1782 struct intel_engine_cs *engine) 1783 { 1784 set_offsets(regs, reg_offsets(engine), engine, false); 1785 } 1786 1787 static bool virtual_matches(const struct virtual_engine *ve, 1788 const struct i915_request *rq, 1789 const struct intel_engine_cs *engine) 1790 { 1791 const struct intel_engine_cs *inflight; 1792 1793 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1794 return false; 1795 1796 /* 1797 * We track when the HW has completed saving the context image 1798 * (i.e. when we have seen the final CS event switching out of 1799 * the context) and must not overwrite the context image before 1800 * then. This restricts us to only using the active engine 1801 * while the previous virtualized request is inflight (so 1802 * we reuse the register offsets). This is a very small 1803 * hystersis on the greedy seelction algorithm. 1804 */ 1805 inflight = intel_context_inflight(&ve->context); 1806 if (inflight && inflight != engine) 1807 return false; 1808 1809 return true; 1810 } 1811 1812 static void virtual_xfer_context(struct virtual_engine *ve, 1813 struct intel_engine_cs *engine) 1814 { 1815 unsigned int n; 1816 1817 if (likely(engine == ve->siblings[0])) 1818 return; 1819 1820 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1821 if (!intel_engine_has_relative_mmio(engine)) 1822 virtual_update_register_offsets(ve->context.lrc_reg_state, 1823 engine); 1824 1825 /* 1826 * Move the bound engine to the top of the list for 1827 * future execution. We then kick this tasklet first 1828 * before checking others, so that we preferentially 1829 * reuse this set of bound registers. 1830 */ 1831 for (n = 1; n < ve->num_siblings; n++) { 1832 if (ve->siblings[n] == engine) { 1833 swap(ve->siblings[n], ve->siblings[0]); 1834 break; 1835 } 1836 } 1837 } 1838 1839 #define for_each_waiter(p__, rq__) \ 1840 list_for_each_entry_lockless(p__, \ 1841 &(rq__)->sched.waiters_list, \ 1842 wait_link) 1843 1844 #define for_each_signaler(p__, rq__) \ 1845 list_for_each_entry_rcu(p__, \ 1846 &(rq__)->sched.signalers_list, \ 1847 signal_link) 1848 1849 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1850 { 1851 LIST_HEAD(list); 1852 1853 /* 1854 * We want to move the interrupted request to the back of 1855 * the round-robin list (i.e. its priority level), but 1856 * in doing so, we must then move all requests that were in 1857 * flight and were waiting for the interrupted request to 1858 * be run after it again. 
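	 * For example, if a priority level holds A (the interrupted request)
	 * followed by B and C, where only C depends on A, deferring A
	 * reorders the level to B, A, C: the independent B now runs first,
	 * while C stays behind its signaler so that no waiter is submitted
	 * ahead of the request it depends on.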
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			if (p->flags & I915_DEPENDENCY_WEAK)
				continue;

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
				   i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (!i915_request_is_ready(w))
				continue;

			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}

static bool
need_timeslice(const struct intel_engine_cs *engine,
	       const struct i915_request *rq,
	       const struct rb_node *rb)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	hint = engine->execlists.queue_priority_hint;

	if (rb) {
		const struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		const struct intel_engine_cs *inflight =
			intel_context_inflight(&ve->context);

		if (!inflight || inflight == engine) {
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				hint = max(hint, rq_prio(next));
			rcu_read_unlock();
		}
	}

	if (!list_is_last(&rq->sched.link, &engine->active.requests))
		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));

	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
	return hint >= effective_prio(rq);
}

static bool
timeslice_yield(const struct intel_engine_execlists *el,
		const struct i915_request *rq)
{
	/*
	 * Once bitten, forever smitten!
	 *
	 * If the active context ever busy-waited on a semaphore,
	 * it will be treated as a hog until the end of its timeslice (i.e.
	 * until it is scheduled out and replaced by a new submission,
	 * possibly even its own lite-restore). The HW only sends an interrupt
	 * on the first miss, and we do not know if that semaphore has been
	 * signaled, or even if it is now stuck on another semaphore. Play
	 * safe, yield if it might be stuck -- it will be given a fresh
	 * timeslice in the near future.
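	 *
	 * For example, if the running context spins on a semaphore while an
	 * equal-priority context waits in the queue, the CS raises a single
	 * semaphore-wait interrupt; el->yield is set from that interrupt to
	 * the spinner's ccid, and timeslice_expired() then reports the slice
	 * as used up so the other context can be promoted.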
1959 */ 1960 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1961 } 1962 1963 static bool 1964 timeslice_expired(const struct intel_engine_execlists *el, 1965 const struct i915_request *rq) 1966 { 1967 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1968 } 1969 1970 static int 1971 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1972 { 1973 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1974 return engine->execlists.queue_priority_hint; 1975 1976 return rq_prio(list_next_entry(rq, sched.link)); 1977 } 1978 1979 static inline unsigned long 1980 timeslice(const struct intel_engine_cs *engine) 1981 { 1982 return READ_ONCE(engine->props.timeslice_duration_ms); 1983 } 1984 1985 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1986 { 1987 const struct intel_engine_execlists *execlists = &engine->execlists; 1988 const struct i915_request *rq = *execlists->active; 1989 1990 if (!rq || i915_request_completed(rq)) 1991 return 0; 1992 1993 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1994 return 0; 1995 1996 return timeslice(engine); 1997 } 1998 1999 static void set_timeslice(struct intel_engine_cs *engine) 2000 { 2001 unsigned long duration; 2002 2003 if (!intel_engine_has_timeslices(engine)) 2004 return; 2005 2006 duration = active_timeslice(engine); 2007 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2008 2009 set_timer_ms(&engine->execlists.timer, duration); 2010 } 2011 2012 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2013 { 2014 struct intel_engine_execlists *execlists = &engine->execlists; 2015 unsigned long duration; 2016 2017 if (!intel_engine_has_timeslices(engine)) 2018 return; 2019 2020 WRITE_ONCE(execlists->switch_priority_hint, prio); 2021 if (prio == INT_MIN) 2022 return; 2023 2024 if (timer_pending(&execlists->timer)) 2025 return; 2026 2027 duration = timeslice(engine); 2028 ENGINE_TRACE(engine, 2029 "start timeslicing, prio:%d, interval:%lu", 2030 prio, duration); 2031 2032 set_timer_ms(&execlists->timer, duration); 2033 } 2034 2035 static void record_preemption(struct intel_engine_execlists *execlists) 2036 { 2037 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2038 } 2039 2040 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2041 const struct i915_request *rq) 2042 { 2043 if (!rq) 2044 return 0; 2045 2046 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2047 if (unlikely(intel_context_is_banned(rq->context))) 2048 return 1; 2049 2050 return READ_ONCE(engine->props.preempt_timeout_ms); 2051 } 2052 2053 static void set_preempt_timeout(struct intel_engine_cs *engine, 2054 const struct i915_request *rq) 2055 { 2056 if (!intel_engine_has_preempt_reset(engine)) 2057 return; 2058 2059 set_timer_ms(&engine->execlists.preempt, 2060 active_preempt_timeout(engine, rq)); 2061 } 2062 2063 static inline void clear_ports(struct i915_request **ports, int count) 2064 { 2065 memset_p((void **)ports, NULL, count); 2066 } 2067 2068 static inline void 2069 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2070 { 2071 /* A memcpy_p() would be very useful here! 
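 * Each element is published with WRITE_ONCE() below so that a concurrent reader walking the ports (e.g. via execlists_active()) never observes a torn pointer.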
*/ 2072 while (count--) 2073 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2074 } 2075 2076 static void execlists_dequeue(struct intel_engine_cs *engine) 2077 { 2078 struct intel_engine_execlists * const execlists = &engine->execlists; 2079 struct i915_request **port = execlists->pending; 2080 struct i915_request ** const last_port = port + execlists->port_mask; 2081 struct i915_request * const *active; 2082 struct i915_request *last; 2083 struct rb_node *rb; 2084 bool submit = false; 2085 2086 /* 2087 * Hardware submission is through 2 ports. Conceptually each port 2088 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2089 * static for a context, and unique to each, so we only execute 2090 * requests belonging to a single context from each ring. RING_HEAD 2091 * is maintained by the CS in the context image, it marks the place 2092 * where it got up to last time, and through RING_TAIL we tell the CS 2093 * where we want to execute up to this time. 2094 * 2095 * In this list the requests are in order of execution. Consecutive 2096 * requests from the same context are adjacent in the ringbuffer. We 2097 * can combine these requests into a single RING_TAIL update: 2098 * 2099 * RING_HEAD...req1...req2 2100 * ^- RING_TAIL 2101 * since to execute req2 the CS must first execute req1. 2102 * 2103 * Our goal then is to point each port to the end of a consecutive 2104 * sequence of requests as being the most optimal (fewest wake ups 2105 * and context switches) submission. 2106 */ 2107 2108 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2109 struct virtual_engine *ve = 2110 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2111 struct i915_request *rq = READ_ONCE(ve->request); 2112 2113 if (!rq) { /* lazily cleanup after another engine handled rq */ 2114 rb_erase_cached(rb, &execlists->virtual); 2115 RB_CLEAR_NODE(rb); 2116 rb = rb_first_cached(&execlists->virtual); 2117 continue; 2118 } 2119 2120 if (!virtual_matches(ve, rq, engine)) { 2121 rb = rb_next(rb); 2122 continue; 2123 } 2124 2125 break; 2126 } 2127 2128 /* 2129 * If the queue is higher priority than the last 2130 * request in the currently active context, submit afresh. 2131 * We will resubmit again afterwards in case we need to split 2132 * the active context to interject the preemption request, 2133 * i.e. we will retrigger preemption following the ack in case 2134 * of trouble. 2135 */ 2136 active = READ_ONCE(execlists->active); 2137 2138 /* 2139 * In theory we can skip over completed contexts that have not 2140 * yet been processed by events (as those events are in flight): 2141 * 2142 * while ((last = *active) && i915_request_completed(last)) 2143 * active++; 2144 * 2145 * However, the GPU cannot handle this as it will ultimately 2146 * find itself trying to jump back into a context it has just 2147 * completed and barf. 2148 */ 2149 2150 if ((last = *active)) { 2151 if (need_preempt(engine, last, rb)) { 2152 if (i915_request_completed(last)) { 2153 tasklet_hi_schedule(&execlists->tasklet); 2154 return; 2155 } 2156 2157 ENGINE_TRACE(engine, 2158 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2159 last->fence.context, 2160 last->fence.seqno, 2161 last->sched.attr.priority, 2162 execlists->queue_priority_hint); 2163 record_preemption(execlists); 2164 2165 /* 2166 * Don't let the RING_HEAD advance past the breadcrumb 2167 * as we unwind (and until we resubmit) so that we do 2168 * not accidentally tell it to go backwards. 
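 * Pausing the ring below arms the preemption semaphore in the status page that the emitted breadcrumbs spin on, so the CS should idle at the breadcrumb rather than run ahead while we rewrite the tail.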
2169 */ 2170 ring_set_paused(engine, 1); 2171 2172 /* 2173 * Note that we have not stopped the GPU at this point, 2174 * so we are unwinding the incomplete requests as they 2175 * remain inflight and so by the time we do complete 2176 * the preemption, some of the unwound requests may 2177 * complete! 2178 */ 2179 __unwind_incomplete_requests(engine); 2180 2181 last = NULL; 2182 } else if (need_timeslice(engine, last, rb) && 2183 timeslice_expired(execlists, last)) { 2184 if (i915_request_completed(last)) { 2185 tasklet_hi_schedule(&execlists->tasklet); 2186 return; 2187 } 2188 2189 ENGINE_TRACE(engine, 2190 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2191 last->fence.context, 2192 last->fence.seqno, 2193 last->sched.attr.priority, 2194 execlists->queue_priority_hint, 2195 yesno(timeslice_yield(execlists, last))); 2196 2197 ring_set_paused(engine, 1); 2198 defer_active(engine); 2199 2200 /* 2201 * Unlike for preemption, if we rewind and continue 2202 * executing the same context as previously active, 2203 * the order of execution will remain the same and 2204 * the tail will only advance. We do not need to 2205 * force a full context restore, as a lite-restore 2206 * is sufficient to resample the monotonic TAIL. 2207 * 2208 * If we switch to any other context, similarly we 2209 * will not rewind TAIL of current context, and 2210 * normal save/restore will preserve state and allow 2211 * us to later continue executing the same request. 2212 */ 2213 last = NULL; 2214 } else { 2215 /* 2216 * Otherwise if we already have a request pending 2217 * for execution after the current one, we can 2218 * just wait until the next CS event before 2219 * queuing more. In either case we will force a 2220 * lite-restore preemption event, but if we wait 2221 * we hopefully coalesce several updates into a single 2222 * submission. 2223 */ 2224 if (!list_is_last(&last->sched.link, 2225 &engine->active.requests)) { 2226 /* 2227 * Even if ELSP[1] is occupied and not worthy 2228 * of timeslices, our queue might be. 2229 */ 2230 start_timeslice(engine, queue_prio(execlists)); 2231 return; 2232 } 2233 } 2234 } 2235 2236 while (rb) { /* XXX virtual is always taking precedence */ 2237 struct virtual_engine *ve = 2238 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2239 struct i915_request *rq; 2240 2241 spin_lock(&ve->base.active.lock); 2242 2243 rq = ve->request; 2244 if (unlikely(!rq)) { /* lost the race to a sibling */ 2245 spin_unlock(&ve->base.active.lock); 2246 rb_erase_cached(rb, &execlists->virtual); 2247 RB_CLEAR_NODE(rb); 2248 rb = rb_first_cached(&execlists->virtual); 2249 continue; 2250 } 2251 2252 GEM_BUG_ON(rq != ve->request); 2253 GEM_BUG_ON(rq->engine != &ve->base); 2254 GEM_BUG_ON(rq->context != &ve->context); 2255 2256 if (rq_prio(rq) >= queue_prio(execlists)) { 2257 if (!virtual_matches(ve, rq, engine)) { 2258 spin_unlock(&ve->base.active.lock); 2259 rb = rb_next(rb); 2260 continue; 2261 } 2262 2263 if (last && !can_merge_rq(last, rq)) { 2264 spin_unlock(&ve->base.active.lock); 2265 start_timeslice(engine, rq_prio(rq)); 2266 return; /* leave this for another sibling */ 2267 } 2268 2269 ENGINE_TRACE(engine, 2270 "virtual rq=%llx:%lld%s, new engine? %s\n", 2271 rq->fence.context, 2272 rq->fence.seqno, 2273 i915_request_completed(rq) ? "!" : 2274 i915_request_started(rq) ? 
"*" : 2275 "", 2276 yesno(engine != ve->siblings[0])); 2277 2278 WRITE_ONCE(ve->request, NULL); 2279 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2280 INT_MIN); 2281 rb_erase_cached(rb, &execlists->virtual); 2282 RB_CLEAR_NODE(rb); 2283 2284 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2285 WRITE_ONCE(rq->engine, engine); 2286 2287 if (__i915_request_submit(rq)) { 2288 /* 2289 * Only after we confirm that we will submit 2290 * this request (i.e. it has not already 2291 * completed), do we want to update the context. 2292 * 2293 * This serves two purposes. It avoids 2294 * unnecessary work if we are resubmitting an 2295 * already completed request after timeslicing. 2296 * But more importantly, it prevents us altering 2297 * ve->siblings[] on an idle context, where 2298 * we may be using ve->siblings[] in 2299 * virtual_context_enter / virtual_context_exit. 2300 */ 2301 virtual_xfer_context(ve, engine); 2302 GEM_BUG_ON(ve->siblings[0] != engine); 2303 2304 submit = true; 2305 last = rq; 2306 } 2307 i915_request_put(rq); 2308 2309 /* 2310 * Hmm, we have a bunch of virtual engine requests, 2311 * but the first one was already completed (thanks 2312 * preempt-to-busy!). Keep looking at the veng queue 2313 * until we have no more relevant requests (i.e. 2314 * the normal submit queue has higher priority). 2315 */ 2316 if (!submit) { 2317 spin_unlock(&ve->base.active.lock); 2318 rb = rb_first_cached(&execlists->virtual); 2319 continue; 2320 } 2321 } 2322 2323 spin_unlock(&ve->base.active.lock); 2324 break; 2325 } 2326 2327 while ((rb = rb_first_cached(&execlists->queue))) { 2328 struct i915_priolist *p = to_priolist(rb); 2329 struct i915_request *rq, *rn; 2330 int i; 2331 2332 priolist_for_each_request_consume(rq, rn, p, i) { 2333 bool merge = true; 2334 2335 /* 2336 * Can we combine this request with the current port? 2337 * It has to be the same context/ringbuffer and not 2338 * have any exceptions (e.g. GVT saying never to 2339 * combine contexts). 2340 * 2341 * If we can combine the requests, we can execute both 2342 * by updating the RING_TAIL to point to the end of the 2343 * second request, and so we never need to tell the 2344 * hardware about the first. 2345 */ 2346 if (last && !can_merge_rq(last, rq)) { 2347 /* 2348 * If we are on the second port and cannot 2349 * combine this request with the last, then we 2350 * are done. 2351 */ 2352 if (port == last_port) 2353 goto done; 2354 2355 /* 2356 * We must not populate both ELSP[] with the 2357 * same LRCA, i.e. we must submit 2 different 2358 * contexts if we submit 2 ELSP. 2359 */ 2360 if (last->context == rq->context) 2361 goto done; 2362 2363 if (i915_request_has_sentinel(last)) 2364 goto done; 2365 2366 /* 2367 * If GVT overrides us we only ever submit 2368 * port[0], leaving port[1] empty. Note that we 2369 * also have to be careful that we don't queue 2370 * the same context (even though a different 2371 * request) to the second port. 
2372 */ 2373 if (ctx_single_port_submission(last->context) || 2374 ctx_single_port_submission(rq->context)) 2375 goto done; 2376 2377 merge = false; 2378 } 2379 2380 if (__i915_request_submit(rq)) { 2381 if (!merge) { 2382 *port = execlists_schedule_in(last, port - execlists->pending); 2383 port++; 2384 last = NULL; 2385 } 2386 2387 GEM_BUG_ON(last && 2388 !can_merge_ctx(last->context, 2389 rq->context)); 2390 GEM_BUG_ON(last && 2391 i915_seqno_passed(last->fence.seqno, 2392 rq->fence.seqno)); 2393 2394 submit = true; 2395 last = rq; 2396 } 2397 } 2398 2399 rb_erase_cached(&p->node, &execlists->queue); 2400 i915_priolist_free(p); 2401 } 2402 2403 done: 2404 /* 2405 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2406 * 2407 * We choose the priority hint such that if we add a request of greater 2408 * priority than this, we kick the submission tasklet to decide on 2409 * the right order of submitting the requests to hardware. We must 2410 * also be prepared to reorder requests as they are in-flight on the 2411 * HW. We derive the priority hint then as the first "hole" in 2412 * the HW submission ports and if there are no available slots, 2413 * the priority of the lowest executing request, i.e. last. 2414 * 2415 * When we do receive a higher priority request ready to run from the 2416 * user, see queue_request(), the priority hint is bumped to that 2417 * request triggering preemption on the next dequeue (or subsequent 2418 * interrupt for secondary ports). 2419 */ 2420 execlists->queue_priority_hint = queue_prio(execlists); 2421 2422 if (submit) { 2423 *port = execlists_schedule_in(last, port - execlists->pending); 2424 execlists->switch_priority_hint = 2425 switch_prio(engine, *execlists->pending); 2426 2427 /* 2428 * Skip if we ended up with exactly the same set of requests, 2429 * e.g. 
trying to timeslice a pair of ordered contexts 2430 */ 2431 if (!memcmp(active, execlists->pending, 2432 (port - execlists->pending + 1) * sizeof(*port))) { 2433 do 2434 execlists_schedule_out(fetch_and_zero(port)); 2435 while (port-- != execlists->pending); 2436 2437 goto skip_submit; 2438 } 2439 clear_ports(port + 1, last_port - port); 2440 2441 WRITE_ONCE(execlists->yield, -1); 2442 set_preempt_timeout(engine, *active); 2443 execlists_submit_ports(engine); 2444 } else { 2445 start_timeslice(engine, execlists->queue_priority_hint); 2446 skip_submit: 2447 ring_set_paused(engine, 0); 2448 } 2449 } 2450 2451 static void 2452 cancel_port_requests(struct intel_engine_execlists * const execlists) 2453 { 2454 struct i915_request * const *port; 2455 2456 for (port = execlists->pending; *port; port++) 2457 execlists_schedule_out(*port); 2458 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2459 2460 /* Mark the end of active before we overwrite *active */ 2461 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2462 execlists_schedule_out(*port); 2463 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2464 2465 smp_wmb(); /* complete the seqlock for execlists_active() */ 2466 WRITE_ONCE(execlists->active, execlists->inflight); 2467 } 2468 2469 static inline void 2470 invalidate_csb_entries(const u64 *first, const u64 *last) 2471 { 2472 clflush((void *)first); 2473 clflush((void *)last); 2474 } 2475 2476 /* 2477 * Starting with Gen12, the status has a new format: 2478 * 2479 * bit 0: switched to new queue 2480 * bit 1: reserved 2481 * bit 2: semaphore wait mode (poll or signal), only valid when 2482 * switch detail is set to "wait on semaphore" 2483 * bits 3-5: engine class 2484 * bits 6-11: engine instance 2485 * bits 12-14: reserved 2486 * bits 15-25: sw context id of the lrc the GT switched to 2487 * bits 26-31: sw counter of the lrc the GT switched to 2488 * bits 32-35: context switch detail 2489 * - 0: ctx complete 2490 * - 1: wait on sync flip 2491 * - 2: wait on vblank 2492 * - 3: wait on scanline 2493 * - 4: wait on semaphore 2494 * - 5: context preempted (not on SEMAPHORE_WAIT or 2495 * WAIT_FOR_EVENT) 2496 * bit 36: reserved 2497 * bits 37-43: wait detail (for switch detail 1 to 4) 2498 * bits 44-46: reserved 2499 * bits 47-57: sw context id of the lrc the GT switched away from 2500 * bits 58-63: sw counter of the lrc the GT switched away from 2501 */ 2502 static inline bool gen12_csb_parse(const u64 csb) 2503 { 2504 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb)); 2505 bool new_queue = 2506 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2507 2508 /* 2509 * The context switch detail is not guaranteed to be 5 when a preemption 2510 * occurs, so we can't just check for that. The check below works for 2511 * all the cases we care about, including preemptions of WAIT 2512 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2513 * would require some extra handling, but we don't support that. 2514 */ 2515 if (!ctx_away_valid || new_queue) { 2516 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb))); 2517 return true; 2518 } 2519 2520 /* 2521 * switch detail = 5 is covered by the case above and we do not expect a 2522 * context switch on an unsuccessful wait instruction since we always 2523 * use polling mode. 
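 * The GEM_BUG_ON below encodes that expectation: per the layout documented above, the switch detail lives in bits 32-35 (the low nibble of the upper dword) and for a valid, same-queue event we expect it to read back as 0 (ctx complete).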
2524 */ 2525 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb))); 2526 return false; 2527 } 2528 2529 static inline bool gen8_csb_parse(const u64 csb) 2530 { 2531 return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2532 } 2533 2534 static noinline u64 2535 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2536 { 2537 u64 entry; 2538 2539 /* 2540 * Reading from the HWSP has one particular advantage: we can detect 2541 * a stale entry. Since the write into HWSP is broken, we have no reason 2542 * to trust the HW at all, the mmio entry may equally be unordered, so 2543 * we prefer the path that is self-checking and as a last resort, 2544 * return the mmio value. 2545 * 2546 * tgl,dg1:HSDES#22011327657 2547 */ 2548 preempt_disable(); 2549 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) { 2550 int idx = csb - engine->execlists.csb_status; 2551 int status; 2552 2553 status = GEN8_EXECLISTS_STATUS_BUF; 2554 if (idx >= 6) { 2555 status = GEN11_EXECLISTS_STATUS_BUF2; 2556 idx -= 6; 2557 } 2558 status += sizeof(u64) * idx; 2559 2560 entry = intel_uncore_read64(engine->uncore, 2561 _MMIO(engine->mmio_base + status)); 2562 } 2563 preempt_enable(); 2564 2565 return entry; 2566 } 2567 2568 static inline u64 2569 csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2570 { 2571 u64 entry = READ_ONCE(*csb); 2572 2573 /* 2574 * Unfortunately, the GPU does not always serialise its write 2575 * of the CSB entries before its write of the CSB pointer, at least 2576 * from the perspective of the CPU, using what is known as a Global 2577 * Observation Point. We may read a new CSB tail pointer, but then 2578 * read the stale CSB entries, causing us to misinterpret the 2579 * context-switch events, and eventually declare the GPU hung. 2580 * 2581 * icl:HSDES#1806554093 2582 * tgl:HSDES#22011248461 2583 */ 2584 if (unlikely(entry == -1)) 2585 entry = wa_csb_read(engine, csb); 2586 2587 /* Consume this entry so that we can spot its future reuse. */ 2588 WRITE_ONCE(*csb, -1); 2589 2590 /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */ 2591 return entry; 2592 } 2593 2594 static void process_csb(struct intel_engine_cs *engine) 2595 { 2596 struct intel_engine_execlists * const execlists = &engine->execlists; 2597 u64 * const buf = execlists->csb_status; 2598 const u8 num_entries = execlists->csb_size; 2599 u8 head, tail; 2600 2601 /* 2602 * As we modify our execlists state tracking we require exclusive 2603 * access. Either we are inside the tasklet, or the tasklet is disabled 2604 * and we assume that is only inside the reset paths and so serialised. 2605 */ 2606 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2607 !reset_in_progress(execlists)); 2608 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2609 2610 /* 2611 * Note that csb_write, csb_status may be either in HWSP or mmio. 2612 * When reading from the csb_write mmio register, we have to be 2613 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2614 * the low 4 bits. As it happens we know the next 4 bits are always 2615 * zero and so we can simply mask off the low u8 of the register 2616 * and treat it identically to reading from the HWSP (without having 2617 * to use explicit shifting and masking, and probably bifurcating 2618 * the code to handle the legacy mmio read).
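 * Either way, the value read below is an index into the CSB in the same 0..num_entries-1 space as our cached csb_head, so a simple equality test tells us whether there are any new events to process.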
2619 */ 2620 head = execlists->csb_head; 2621 tail = READ_ONCE(*execlists->csb_write); 2622 if (unlikely(head == tail)) 2623 return; 2624 2625 /* 2626 * We will consume all events from HW, or at least pretend to. 2627 * 2628 * The sequence of events from the HW is deterministic, and derived 2629 * from our writes to the ELSP, with a smidgen of variability for 2630 * the arrival of the asynchronous requests wrt the inflight 2631 * execution. If the HW sends an event that does not correspond with 2632 * the one we are expecting, we have to abandon all hope as we lose 2633 * all tracking of what the engine is actually executing. We will 2634 * only detect we are out of sequence with the HW when we get an 2635 * 'impossible' event because we have already drained our own 2636 * preemption/promotion queue. If this occurs, we know that we likely 2637 * lost track of execution earlier and must unwind and restart; the 2638 * simplest way is to stop processing the event queue and force the 2639 * engine to reset. 2640 */ 2641 execlists->csb_head = tail; 2642 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2643 2644 /* 2645 * Hopefully paired with a wmb() in HW! 2646 * 2647 * We must complete the read of the write pointer before any reads 2648 * from the CSB, so that we do not see stale values. Without an rmb 2649 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2650 * we perform the READ_ONCE(*csb_write). 2651 */ 2652 rmb(); 2653 do { 2654 bool promote; 2655 u64 csb; 2656 2657 if (++head == num_entries) 2658 head = 0; 2659 2660 /* 2661 * We are flying near dragons again. 2662 * 2663 * We hold a reference to the request in execlist_port[] 2664 * but no more than that. We are operating in softirq 2665 * context and so cannot hold any mutex or sleep. That 2666 * prevents us stopping the requests we are processing 2667 * in port[] from being retired simultaneously (the 2668 * breadcrumb will be complete before we see the 2669 * context-switch). As we only hold the reference to the 2670 * request, any pointer chasing underneath the request 2671 * is subject to a potential use-after-free. Thus we 2672 * store all of the bookkeeping within port[] as 2673 * required, and avoid using unguarded pointers beneath 2674 * request itself. The same applies to the atomic 2675 * status notifier.
2676 */ 2677 2678 csb = csb_read(engine, buf + head); 2679 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2680 head, upper_32_bits(csb), lower_32_bits(csb)); 2681 2682 if (INTEL_GEN(engine->i915) >= 12) 2683 promote = gen12_csb_parse(csb); 2684 else 2685 promote = gen8_csb_parse(csb); 2686 if (promote) { 2687 struct i915_request * const *old = execlists->active; 2688 2689 if (GEM_WARN_ON(!*execlists->pending)) { 2690 execlists->error_interrupt |= ERROR_CSB; 2691 break; 2692 } 2693 2694 ring_set_paused(engine, 0); 2695 2696 /* Point active to the new ELSP; prevent overwriting */ 2697 WRITE_ONCE(execlists->active, execlists->pending); 2698 smp_wmb(); /* notify execlists_active() */ 2699 2700 /* cancel old inflight, prepare for switch */ 2701 trace_ports(execlists, "preempted", old); 2702 while (*old) 2703 execlists_schedule_out(*old++); 2704 2705 /* switch pending to inflight */ 2706 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2707 copy_ports(execlists->inflight, 2708 execlists->pending, 2709 execlists_num_ports(execlists)); 2710 smp_wmb(); /* complete the seqlock */ 2711 WRITE_ONCE(execlists->active, execlists->inflight); 2712 2713 /* XXX Magic delay for tgl */ 2714 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2715 2716 WRITE_ONCE(execlists->pending[0], NULL); 2717 } else { 2718 if (GEM_WARN_ON(!*execlists->active)) { 2719 execlists->error_interrupt |= ERROR_CSB; 2720 break; 2721 } 2722 2723 /* port0 completed, advanced to port1 */ 2724 trace_ports(execlists, "completed", execlists->active); 2725 2726 /* 2727 * We rely on the hardware being strongly 2728 * ordered, that the breadcrumb write is 2729 * coherent (visible from the CPU) before the 2730 * user interrupt is processed. One might assume 2731 * that the breadcrumb write being before the 2732 * user interrupt and the CS event for the context 2733 * switch would therefore be before the CS event 2734 * itself... 2735 */ 2736 if (GEM_SHOW_DEBUG() && 2737 !i915_request_completed(*execlists->active)) { 2738 struct i915_request *rq = *execlists->active; 2739 const u32 *regs __maybe_unused = 2740 rq->context->lrc_reg_state; 2741 2742 ENGINE_TRACE(engine, 2743 "context completed before request!\n"); 2744 ENGINE_TRACE(engine, 2745 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2746 ENGINE_READ(engine, RING_START), 2747 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2748 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2749 ENGINE_READ(engine, RING_CTL), 2750 ENGINE_READ(engine, RING_MI_MODE)); 2751 ENGINE_TRACE(engine, 2752 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2753 i915_ggtt_offset(rq->ring->vma), 2754 rq->head, rq->tail, 2755 rq->fence.context, 2756 lower_32_bits(rq->fence.seqno), 2757 hwsp_seqno(rq)); 2758 ENGINE_TRACE(engine, 2759 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2760 regs[CTX_RING_START], 2761 regs[CTX_RING_HEAD], 2762 regs[CTX_RING_TAIL]); 2763 } 2764 2765 execlists_schedule_out(*execlists->active++); 2766 2767 GEM_BUG_ON(execlists->active - execlists->inflight > 2768 execlists_num_ports(execlists)); 2769 } 2770 } while (head != tail); 2771 2772 set_timeslice(engine); 2773 2774 /* 2775 * Gen11 has proven to fail wrt the global observation point between 2776 * entry and tail update, failing on the ordering and thus 2777 * we see an old entry in the context status buffer. 2778 * 2779 * Forcibly evict the entries before the next gpu csb update, 2780 * to increase the odds that we get fresh entries even with 2781 * non-working hardware.
The cost for doing so comes out mostly in 2782 * the wash, as the hardware, working or not, will need to do the 2783 * invalidation beforehand. 2784 */ 2785 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2786 } 2787 2788 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2789 { 2790 lockdep_assert_held(&engine->active.lock); 2791 if (!READ_ONCE(engine->execlists.pending[0])) { 2792 rcu_read_lock(); /* protect peeking at execlists->active */ 2793 execlists_dequeue(engine); 2794 rcu_read_unlock(); 2795 } 2796 } 2797 2798 static void __execlists_hold(struct i915_request *rq) 2799 { 2800 LIST_HEAD(list); 2801 2802 do { 2803 struct i915_dependency *p; 2804 2805 if (i915_request_is_active(rq)) 2806 __i915_request_unsubmit(rq); 2807 2808 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2809 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2810 i915_request_set_hold(rq); 2811 RQ_TRACE(rq, "on hold\n"); 2812 2813 for_each_waiter(p, rq) { 2814 struct i915_request *w = 2815 container_of(p->waiter, typeof(*w), sched); 2816 2817 /* Leave semaphores spinning on the other engines */ 2818 if (w->engine != rq->engine) 2819 continue; 2820 2821 if (!i915_request_is_ready(w)) 2822 continue; 2823 2824 if (i915_request_completed(w)) 2825 continue; 2826 2827 if (i915_request_on_hold(w)) 2828 continue; 2829 2830 list_move_tail(&w->sched.link, &list); 2831 } 2832 2833 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2834 } while (rq); 2835 } 2836 2837 static bool execlists_hold(struct intel_engine_cs *engine, 2838 struct i915_request *rq) 2839 { 2840 spin_lock_irq(&engine->active.lock); 2841 2842 if (i915_request_completed(rq)) { /* too late! */ 2843 rq = NULL; 2844 goto unlock; 2845 } 2846 2847 if (rq->engine != engine) { /* preempted virtual engine */ 2848 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2849 2850 /* 2851 * intel_context_inflight() is only protected by virtue 2852 * of process_csb() being called only by the tasklet (or 2853 * directly from inside reset while the tasklet is suspended). 2854 * Assert that neither of those are allowed to run while we 2855 * poke at the request queues. 2856 */ 2857 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2858 2859 /* 2860 * An unsubmitted request along a virtual engine will 2861 * remain on the active (this) engine until we are able 2862 * to process the context switch away (and so mark the 2863 * context as no longer in flight). That cannot have happened 2864 * yet, otherwise we would not be hanging! 2865 */ 2866 spin_lock(&ve->base.active.lock); 2867 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2868 GEM_BUG_ON(ve->request != rq); 2869 ve->request = NULL; 2870 spin_unlock(&ve->base.active.lock); 2871 i915_request_put(rq); 2872 2873 rq->engine = engine; 2874 } 2875 2876 /* 2877 * Transfer this request onto the hold queue to prevent it 2878 * being resubmitted to HW (and potentially completed) before we have 2879 * released it. Since we may have already submitted following 2880 * requests, we need to remove those as well.
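 * __execlists_hold() below does exactly that, transitively: it unsubmits the request if it is still active, parks it on engine->active.hold and then repeats for every ready waiter on this engine.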
2881 */ 2882 GEM_BUG_ON(i915_request_on_hold(rq)); 2883 GEM_BUG_ON(rq->engine != engine); 2884 __execlists_hold(rq); 2885 GEM_BUG_ON(list_empty(&engine->active.hold)); 2886 2887 unlock: 2888 spin_unlock_irq(&engine->active.lock); 2889 return rq; 2890 } 2891 2892 static bool hold_request(const struct i915_request *rq) 2893 { 2894 struct i915_dependency *p; 2895 bool result = false; 2896 2897 /* 2898 * If one of our ancestors is on hold, we must also be on hold, 2899 * otherwise we will bypass it and execute before it. 2900 */ 2901 rcu_read_lock(); 2902 for_each_signaler(p, rq) { 2903 const struct i915_request *s = 2904 container_of(p->signaler, typeof(*s), sched); 2905 2906 if (s->engine != rq->engine) 2907 continue; 2908 2909 result = i915_request_on_hold(s); 2910 if (result) 2911 break; 2912 } 2913 rcu_read_unlock(); 2914 2915 return result; 2916 } 2917 2918 static void __execlists_unhold(struct i915_request *rq) 2919 { 2920 LIST_HEAD(list); 2921 2922 do { 2923 struct i915_dependency *p; 2924 2925 RQ_TRACE(rq, "hold release\n"); 2926 2927 GEM_BUG_ON(!i915_request_on_hold(rq)); 2928 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2929 2930 i915_request_clear_hold(rq); 2931 list_move_tail(&rq->sched.link, 2932 i915_sched_lookup_priolist(rq->engine, 2933 rq_prio(rq))); 2934 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2935 2936 /* Also release any children on this engine that are ready */ 2937 for_each_waiter(p, rq) { 2938 struct i915_request *w = 2939 container_of(p->waiter, typeof(*w), sched); 2940 2941 /* Propagate any change in error status */ 2942 if (rq->fence.error) 2943 i915_request_set_error_once(w, rq->fence.error); 2944 2945 if (w->engine != rq->engine) 2946 continue; 2947 2948 if (!i915_request_on_hold(w)) 2949 continue; 2950 2951 /* Check that no other parents are also on hold */ 2952 if (hold_request(w)) 2953 continue; 2954 2955 list_move_tail(&w->sched.link, &list); 2956 } 2957 2958 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2959 } while (rq); 2960 } 2961 2962 static void execlists_unhold(struct intel_engine_cs *engine, 2963 struct i915_request *rq) 2964 { 2965 spin_lock_irq(&engine->active.lock); 2966 2967 /* 2968 * Move this request back to the priority queue, and all of its 2969 * children and grandchildren that were suspended along with it. 2970 */ 2971 __execlists_unhold(rq); 2972 2973 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2974 engine->execlists.queue_priority_hint = rq_prio(rq); 2975 tasklet_hi_schedule(&engine->execlists.tasklet); 2976 } 2977 2978 spin_unlock_irq(&engine->active.lock); 2979 } 2980 2981 struct execlists_capture { 2982 struct work_struct work; 2983 struct i915_request *rq; 2984 struct i915_gpu_coredump *error; 2985 }; 2986 2987 static void execlists_capture_work(struct work_struct *work) 2988 { 2989 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2990 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2991 struct intel_engine_cs *engine = cap->rq->engine; 2992 struct intel_gt_coredump *gt = cap->error->gt; 2993 struct intel_engine_capture_vma *vma; 2994 2995 /* Compress all the objects attached to the request, slow! 
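 * (The allocations here may use GFP_KERNEL, which is why this runs from the worker rather than from the softirq that detected the hang.)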
*/ 2996 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2997 if (vma) { 2998 struct i915_vma_compress *compress = 2999 i915_vma_capture_prepare(gt); 3000 3001 intel_engine_coredump_add_vma(gt->engine, vma, compress); 3002 i915_vma_capture_finish(gt, compress); 3003 } 3004 3005 gt->simulated = gt->engine->simulated; 3006 cap->error->simulated = gt->simulated; 3007 3008 /* Publish the error state, and announce it to the world */ 3009 i915_error_state_store(cap->error); 3010 i915_gpu_coredump_put(cap->error); 3011 3012 /* Return this request and all that depend upon it for signaling */ 3013 execlists_unhold(engine, cap->rq); 3014 i915_request_put(cap->rq); 3015 3016 kfree(cap); 3017 } 3018 3019 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 3020 { 3021 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 3022 struct execlists_capture *cap; 3023 3024 cap = kmalloc(sizeof(*cap), gfp); 3025 if (!cap) 3026 return NULL; 3027 3028 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 3029 if (!cap->error) 3030 goto err_cap; 3031 3032 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 3033 if (!cap->error->gt) 3034 goto err_gpu; 3035 3036 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 3037 if (!cap->error->gt->engine) 3038 goto err_gt; 3039 3040 cap->error->gt->engine->hung = true; 3041 3042 return cap; 3043 3044 err_gt: 3045 kfree(cap->error->gt); 3046 err_gpu: 3047 kfree(cap->error); 3048 err_cap: 3049 kfree(cap); 3050 return NULL; 3051 } 3052 3053 static struct i915_request * 3054 active_context(struct intel_engine_cs *engine, u32 ccid) 3055 { 3056 const struct intel_engine_execlists * const el = &engine->execlists; 3057 struct i915_request * const *port, *rq; 3058 3059 /* 3060 * Use the most recent result from process_csb(), but just in case 3061 * we trigger an error (via interrupt) before the first CS event has 3062 * been written, peek at the next submission. 3063 */ 3064 3065 for (port = el->active; (rq = *port); port++) { 3066 if (rq->context->lrc.ccid == ccid) { 3067 ENGINE_TRACE(engine, 3068 "ccid found at active:%zd\n", 3069 port - el->active); 3070 return rq; 3071 } 3072 } 3073 3074 for (port = el->pending; (rq = *port); port++) { 3075 if (rq->context->lrc.ccid == ccid) { 3076 ENGINE_TRACE(engine, 3077 "ccid found at pending:%zd\n", 3078 port - el->pending); 3079 return rq; 3080 } 3081 } 3082 3083 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3084 return NULL; 3085 } 3086 3087 static u32 active_ccid(struct intel_engine_cs *engine) 3088 { 3089 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3090 } 3091 3092 static void execlists_capture(struct intel_engine_cs *engine) 3093 { 3094 struct execlists_capture *cap; 3095 3096 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3097 return; 3098 3099 /* 3100 * We need to _quickly_ capture the engine state before we reset. 3101 * We are inside an atomic section (softirq) here and we are delaying 3102 * the forced preemption event. 3103 */ 3104 cap = capture_regs(engine); 3105 if (!cap) 3106 return; 3107 3108 spin_lock_irq(&engine->active.lock); 3109 cap->rq = active_context(engine, active_ccid(engine)); 3110 if (cap->rq) { 3111 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3112 cap->rq = i915_request_get_rcu(cap->rq); 3113 } 3114 spin_unlock_irq(&engine->active.lock); 3115 if (!cap->rq) 3116 goto err_free; 3117 3118 /* 3119 * Remove the request from the execlists queue, and take ownership 3120 * of the request. 
We pass it to our worker who will _slowly_ compress 3121 * all the pages the _user_ requested for debugging their batch, after 3122 * which we return it to the queue for signaling. 3123 * 3124 * By removing them from the execlists queue, we also remove the 3125 * requests from being processed by __unwind_incomplete_requests() 3126 * during the intel_engine_reset(), and so they will *not* be replayed 3127 * afterwards. 3128 * 3129 * Note that because we have not yet reset the engine at this point, 3130 * it is possible that the request we have identified as being 3131 * guilty did in fact complete, and we will then hit an arbitration 3132 * point allowing the outstanding preemption to succeed. The likelihood 3133 * of that is very low (as capturing of the engine registers should be 3134 * fast enough to run inside an irq-off atomic section!), so we will 3135 * simply hold that request accountable for being non-preemptible 3136 * long enough to force the reset. 3137 */ 3138 if (!execlists_hold(engine, cap->rq)) 3139 goto err_rq; 3140 3141 INIT_WORK(&cap->work, execlists_capture_work); 3142 schedule_work(&cap->work); 3143 return; 3144 3145 err_rq: 3146 i915_request_put(cap->rq); 3147 err_free: 3148 i915_gpu_coredump_put(cap->error); 3149 kfree(cap); 3150 } 3151 3152 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3153 { 3154 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3155 unsigned long *lock = &engine->gt->reset.flags; 3156 3157 if (!intel_has_reset_engine(engine->gt)) 3158 return; 3159 3160 if (test_and_set_bit(bit, lock)) 3161 return; 3162 3163 ENGINE_TRACE(engine, "reset for %s\n", msg); 3164 3165 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3166 tasklet_disable_nosync(&engine->execlists.tasklet); 3167 3168 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3169 execlists_capture(engine); 3170 intel_engine_reset(engine, msg); 3171 3172 tasklet_enable(&engine->execlists.tasklet); 3173 clear_and_wake_up_bit(bit, lock); 3174 } 3175 3176 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3177 { 3178 const struct timer_list *t = &engine->execlists.preempt; 3179 3180 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3181 return false; 3182 3183 if (!timer_expired(t)) 3184 return false; 3185 3186 return READ_ONCE(engine->execlists.pending[0]); 3187 } 3188 3189 /* 3190 * Check the unread Context Status Buffers and manage the submission of new 3191 * contexts to the ELSP accordingly. 3192 */ 3193 static void execlists_submission_tasklet(unsigned long data) 3194 { 3195 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3196 bool timeout = preempt_timeout(engine); 3197 3198 process_csb(engine); 3199 3200 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3201 const char *msg; 3202 3203 /* Generate the error message in priority wrt the user!
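 * i.e. an error raised by the user's own payload is reported in preference to our internal CSB tracking error.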
*/ 3204 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3205 msg = "CS error"; /* thrown by a user payload */ 3206 else if (engine->execlists.error_interrupt & ERROR_CSB) 3207 msg = "invalid CSB event"; 3208 else 3209 msg = "internal error"; 3210 3211 engine->execlists.error_interrupt = 0; 3212 execlists_reset(engine, msg); 3213 } 3214 3215 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3216 unsigned long flags; 3217 3218 spin_lock_irqsave(&engine->active.lock, flags); 3219 __execlists_submission_tasklet(engine); 3220 spin_unlock_irqrestore(&engine->active.lock, flags); 3221 3222 /* Recheck after serialising with direct-submission */ 3223 if (unlikely(timeout && preempt_timeout(engine))) 3224 execlists_reset(engine, "preemption time out"); 3225 } 3226 } 3227 3228 static void __execlists_kick(struct intel_engine_execlists *execlists) 3229 { 3230 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3231 tasklet_hi_schedule(&execlists->tasklet); 3232 } 3233 3234 #define execlists_kick(t, member) \ 3235 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3236 3237 static void execlists_timeslice(struct timer_list *timer) 3238 { 3239 execlists_kick(timer, timer); 3240 } 3241 3242 static void execlists_preempt(struct timer_list *timer) 3243 { 3244 execlists_kick(timer, preempt); 3245 } 3246 3247 static void queue_request(struct intel_engine_cs *engine, 3248 struct i915_request *rq) 3249 { 3250 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3251 list_add_tail(&rq->sched.link, 3252 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3253 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3254 } 3255 3256 static void __submit_queue_imm(struct intel_engine_cs *engine) 3257 { 3258 struct intel_engine_execlists * const execlists = &engine->execlists; 3259 3260 if (reset_in_progress(execlists)) 3261 return; /* defer until we restart the engine following reset */ 3262 3263 __execlists_submission_tasklet(engine); 3264 } 3265 3266 static void submit_queue(struct intel_engine_cs *engine, 3267 const struct i915_request *rq) 3268 { 3269 struct intel_engine_execlists *execlists = &engine->execlists; 3270 3271 if (rq_prio(rq) <= execlists->queue_priority_hint) 3272 return; 3273 3274 execlists->queue_priority_hint = rq_prio(rq); 3275 __submit_queue_imm(engine); 3276 } 3277 3278 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3279 const struct i915_request *rq) 3280 { 3281 GEM_BUG_ON(i915_request_on_hold(rq)); 3282 return !list_empty(&engine->active.hold) && hold_request(rq); 3283 } 3284 3285 static void flush_csb(struct intel_engine_cs *engine) 3286 { 3287 struct intel_engine_execlists *el = &engine->execlists; 3288 3289 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3290 if (!reset_in_progress(el)) 3291 process_csb(engine); 3292 tasklet_unlock(&el->tasklet); 3293 } 3294 } 3295 3296 static void execlists_submit_request(struct i915_request *request) 3297 { 3298 struct intel_engine_cs *engine = request->engine; 3299 unsigned long flags; 3300 3301 /* Hopefully we clear execlists->pending[] to let us through */ 3302 flush_csb(engine); 3303 3304 /* Will be called from irq-context when using foreign fences. 
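 * Hence the irqsave variant of the lock below rather than a plain spin_lock_irq().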
*/ 3305 spin_lock_irqsave(&engine->active.lock, flags); 3306 3307 if (unlikely(ancestor_on_hold(engine, request))) { 3308 RQ_TRACE(request, "ancestor on hold\n"); 3309 list_add_tail(&request->sched.link, &engine->active.hold); 3310 i915_request_set_hold(request); 3311 } else { 3312 queue_request(engine, request); 3313 3314 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3315 GEM_BUG_ON(list_empty(&request->sched.link)); 3316 3317 submit_queue(engine, request); 3318 } 3319 3320 spin_unlock_irqrestore(&engine->active.lock, flags); 3321 } 3322 3323 static void __execlists_context_fini(struct intel_context *ce) 3324 { 3325 intel_ring_put(ce->ring); 3326 i915_vma_put(ce->state); 3327 } 3328 3329 static void execlists_context_destroy(struct kref *kref) 3330 { 3331 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3332 3333 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3334 GEM_BUG_ON(intel_context_is_pinned(ce)); 3335 3336 if (ce->state) 3337 __execlists_context_fini(ce); 3338 3339 intel_context_fini(ce); 3340 intel_context_free(ce); 3341 } 3342 3343 static void 3344 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3345 { 3346 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3347 return; 3348 3349 vaddr += engine->context_size; 3350 3351 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3352 } 3353 3354 static void 3355 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3356 { 3357 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3358 return; 3359 3360 vaddr += engine->context_size; 3361 3362 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3363 drm_err_once(&engine->i915->drm, 3364 "%s context redzone overwritten!\n", 3365 engine->name); 3366 } 3367 3368 static void execlists_context_unpin(struct intel_context *ce) 3369 { 3370 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3371 ce->engine); 3372 } 3373 3374 static void execlists_context_post_unpin(struct intel_context *ce) 3375 { 3376 i915_gem_object_unpin_map(ce->state->obj); 3377 } 3378 3379 static u32 * 3380 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3381 { 3382 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3383 MI_SRM_LRM_GLOBAL_GTT | 3384 MI_LRI_LRM_CS_MMIO; 3385 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3386 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3387 CTX_TIMESTAMP * sizeof(u32); 3388 *cs++ = 0; 3389 3390 *cs++ = MI_LOAD_REGISTER_REG | 3391 MI_LRR_SOURCE_CS_MMIO | 3392 MI_LRI_LRM_CS_MMIO; 3393 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3394 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3395 3396 *cs++ = MI_LOAD_REGISTER_REG | 3397 MI_LRR_SOURCE_CS_MMIO | 3398 MI_LRI_LRM_CS_MMIO; 3399 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3400 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3401 3402 return cs; 3403 } 3404 3405 static u32 * 3406 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3407 { 3408 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3409 3410 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3411 MI_SRM_LRM_GLOBAL_GTT | 3412 MI_LRI_LRM_CS_MMIO; 3413 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3414 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3415 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3416 *cs++ = 0; 3417 3418 return cs; 3419 } 3420 3421 static u32 * 3422 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3423 { 3424 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3425 3426 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3427 
MI_SRM_LRM_GLOBAL_GTT | 3428 MI_LRI_LRM_CS_MMIO; 3429 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3430 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3431 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3432 *cs++ = 0; 3433 3434 *cs++ = MI_LOAD_REGISTER_REG | 3435 MI_LRR_SOURCE_CS_MMIO | 3436 MI_LRI_LRM_CS_MMIO; 3437 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3438 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3439 3440 return cs; 3441 } 3442 3443 static u32 * 3444 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3445 { 3446 cs = gen12_emit_timestamp_wa(ce, cs); 3447 cs = gen12_emit_cmd_buf_wa(ce, cs); 3448 cs = gen12_emit_restore_scratch(ce, cs); 3449 3450 return cs; 3451 } 3452 3453 static u32 * 3454 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3455 { 3456 cs = gen12_emit_timestamp_wa(ce, cs); 3457 cs = gen12_emit_restore_scratch(ce, cs); 3458 3459 return cs; 3460 } 3461 3462 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3463 { 3464 return PAGE_SIZE * ce->wa_bb_page; 3465 } 3466 3467 static u32 *context_indirect_bb(const struct intel_context *ce) 3468 { 3469 void *ptr; 3470 3471 GEM_BUG_ON(!ce->wa_bb_page); 3472 3473 ptr = ce->lrc_reg_state; 3474 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3475 ptr += context_wa_bb_offset(ce); 3476 3477 return ptr; 3478 } 3479 3480 static void 3481 setup_indirect_ctx_bb(const struct intel_context *ce, 3482 const struct intel_engine_cs *engine, 3483 u32 *(*emit)(const struct intel_context *, u32 *)) 3484 { 3485 u32 * const start = context_indirect_bb(ce); 3486 u32 *cs; 3487 3488 cs = emit(ce, start); 3489 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3490 while ((unsigned long)cs % CACHELINE_BYTES) 3491 *cs++ = MI_NOOP; 3492 3493 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3494 i915_ggtt_offset(ce->state) + 3495 context_wa_bb_offset(ce), 3496 (cs - start) * sizeof(*cs)); 3497 } 3498 3499 static void 3500 __execlists_update_reg_state(const struct intel_context *ce, 3501 const struct intel_engine_cs *engine, 3502 u32 head) 3503 { 3504 struct intel_ring *ring = ce->ring; 3505 u32 *regs = ce->lrc_reg_state; 3506 3507 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3508 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3509 3510 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3511 regs[CTX_RING_HEAD] = head; 3512 regs[CTX_RING_TAIL] = ring->tail; 3513 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3514 3515 /* RPCS */ 3516 if (engine->class == RENDER_CLASS) { 3517 regs[CTX_R_PWR_CLK_STATE] = 3518 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3519 3520 i915_oa_init_reg_state(ce, engine); 3521 } 3522 3523 if (ce->wa_bb_page) { 3524 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3525 3526 fn = gen12_emit_indirect_ctx_xcs; 3527 if (ce->engine->class == RENDER_CLASS) 3528 fn = gen12_emit_indirect_ctx_rcs; 3529 3530 /* Mutually exclusive wrt to global indirect bb */ 3531 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3532 setup_indirect_ctx_bb(ce, engine, fn); 3533 } 3534 } 3535 3536 static int 3537 execlists_context_pre_pin(struct intel_context *ce, 3538 struct i915_gem_ww_ctx *ww, void **vaddr) 3539 { 3540 GEM_BUG_ON(!ce->state); 3541 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3542 3543 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3544 i915_coherent_map_type(ce->engine->i915) | 3545 I915_MAP_OVERRIDE); 3546 3547 return PTR_ERR_OR_ZERO(*vaddr); 3548 } 3549 3550 static int 3551 
__execlists_context_pin(struct intel_context *ce, 3552 struct intel_engine_cs *engine, 3553 void *vaddr) 3554 { 3555 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3556 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3557 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3558 3559 return 0; 3560 } 3561 3562 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3563 { 3564 return __execlists_context_pin(ce, ce->engine, vaddr); 3565 } 3566 3567 static int execlists_context_alloc(struct intel_context *ce) 3568 { 3569 return __execlists_context_alloc(ce, ce->engine); 3570 } 3571 3572 static void execlists_context_reset(struct intel_context *ce) 3573 { 3574 CE_TRACE(ce, "reset\n"); 3575 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3576 3577 intel_ring_reset(ce->ring, ce->ring->emit); 3578 3579 /* Scrub away the garbage */ 3580 execlists_init_reg_state(ce->lrc_reg_state, 3581 ce, ce->engine, ce->ring, true); 3582 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3583 3584 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3585 } 3586 3587 static const struct intel_context_ops execlists_context_ops = { 3588 .alloc = execlists_context_alloc, 3589 3590 .pre_pin = execlists_context_pre_pin, 3591 .pin = execlists_context_pin, 3592 .unpin = execlists_context_unpin, 3593 .post_unpin = execlists_context_post_unpin, 3594 3595 .enter = intel_context_enter_engine, 3596 .exit = intel_context_exit_engine, 3597 3598 .reset = execlists_context_reset, 3599 .destroy = execlists_context_destroy, 3600 }; 3601 3602 static u32 hwsp_offset(const struct i915_request *rq) 3603 { 3604 const struct intel_timeline_cacheline *cl; 3605 3606 /* Before the request is executed, the timeline/cacheline is fixed */ 3607 3608 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); 3609 if (cl) 3610 return cl->ggtt_offset; 3611 3612 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; 3613 } 3614 3615 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3616 { 3617 u32 *cs; 3618 3619 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3620 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3621 return 0; 3622 3623 cs = intel_ring_begin(rq, 6); 3624 if (IS_ERR(cs)) 3625 return PTR_ERR(cs); 3626 3627 /* 3628 * Check if we have been preempted before we even get started. 3629 * 3630 * After this point i915_request_started() reports true, even if 3631 * we get preempted and so are no longer running. 3632 */ 3633 *cs++ = MI_ARB_CHECK; 3634 *cs++ = MI_NOOP; 3635 3636 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3637 *cs++ = hwsp_offset(rq); 3638 *cs++ = 0; 3639 *cs++ = rq->fence.seqno - 1; 3640 3641 intel_ring_advance(rq, cs); 3642 3643 /* Record the updated position of the request's payload */ 3644 rq->infix = intel_ring_offset(rq, cs); 3645 3646 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3647 3648 return 0; 3649 } 3650 3651 static int emit_pdps(struct i915_request *rq) 3652 { 3653 const struct intel_engine_cs * const engine = rq->engine; 3654 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3655 int err, i; 3656 u32 *cs; 3657 3658 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3659 3660 /* 3661 * Beware ye of the dragons, this sequence is magic! 3662 * 3663 * Small changes to this sequence can cause anything from 3664 * GPU hangs to forcewake errors and machine lockups!
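 * The sequence below is: flush the residue of the context load, invalidate, then reload each of the GEN8_3LVL_PDPES page-directory pointer pairs with a single force-posted MI_LOAD_REGISTER_IMM.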
3665 */ 3666 3667 /* Flush any residual operations from the context load */ 3668 err = engine->emit_flush(rq, EMIT_FLUSH); 3669 if (err) 3670 return err; 3671 3672 /* Magic required to prevent forcewake errors! */ 3673 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3674 if (err) 3675 return err; 3676 3677 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3678 if (IS_ERR(cs)) 3679 return PTR_ERR(cs); 3680 3681 /* Ensure the LRI have landed before we invalidate & continue */ 3682 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3683 for (i = GEN8_3LVL_PDPES; i--; ) { 3684 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3685 u32 base = engine->mmio_base; 3686 3687 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3688 *cs++ = upper_32_bits(pd_daddr); 3689 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3690 *cs++ = lower_32_bits(pd_daddr); 3691 } 3692 *cs++ = MI_NOOP; 3693 3694 intel_ring_advance(rq, cs); 3695 3696 return 0; 3697 } 3698 3699 static int execlists_request_alloc(struct i915_request *request) 3700 { 3701 int ret; 3702 3703 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3704 3705 /* 3706 * Flush enough space to reduce the likelihood of waiting after 3707 * we start building the request - in which case we will just 3708 * have to repeat work. 3709 */ 3710 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3711 3712 /* 3713 * Note that after this point, we have committed to using 3714 * this request as it is being used to both track the 3715 * state of engine initialisation and liveness of the 3716 * golden renderstate above. Think twice before you try 3717 * to cancel/unwind this request now. 3718 */ 3719 3720 if (!i915_vm_is_4lvl(request->context->vm)) { 3721 ret = emit_pdps(request); 3722 if (ret) 3723 return ret; 3724 } 3725 3726 /* Unconditionally invalidate GPU caches and TLBs. */ 3727 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3728 if (ret) 3729 return ret; 3730 3731 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3732 return 0; 3733 } 3734 3735 /* 3736 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 3737 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3738 * but there is a slight complication as this is applied in a WA batch where the 3739 * values are only initialized once so we cannot take the register value at the 3740 * beginning and reuse it further; hence we save its value to memory, upload a 3741 * constant value with bit21 set and then we restore it back with the saved value. 3742 * To simplify the WA, a constant value is formed by using the default value 3743 * of this register. This shouldn't be a problem because we are only modifying 3744 * it for a short period and this batch is non-preemptible. We can of course 3745 * use additional instructions that read the actual value of the register 3746 * at that time and set our bit of interest but it makes the WA complicated. 3747 * 3748 * This WA is also required for Gen9 so extracting as a function avoids 3749 * code duplication. 3750 */ 3751 static u32 * 3752 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3753 { 3754 /* NB no one else is allowed to scribble over scratch + 256!
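 * The batch below implements the save/modify/restore described above: store the current GEN8_L3SQCREG4 into the COHERENTL3_WA scratch slot, load the constant with the flush bit set, emit the stalling pipe control, then load the saved value back.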
*/ 3755 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3756 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3757 *batch++ = intel_gt_scratch_offset(engine->gt, 3758 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3759 *batch++ = 0; 3760 3761 *batch++ = MI_LOAD_REGISTER_IMM(1); 3762 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3763 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3764 3765 batch = gen8_emit_pipe_control(batch, 3766 PIPE_CONTROL_CS_STALL | 3767 PIPE_CONTROL_DC_FLUSH_ENABLE, 3768 0); 3769 3770 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3771 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3772 *batch++ = intel_gt_scratch_offset(engine->gt, 3773 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3774 *batch++ = 0; 3775 3776 return batch; 3777 } 3778 3779 /* 3780 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3781 * initialized at the beginning and shared across all contexts but this field 3782 * helps us to have multiple batches at different offsets and select them based 3783 * on a criteria. At the moment this batch always start at the beginning of the page 3784 * and at this point we don't have multiple wa_ctx batch buffers. 3785 * 3786 * The number of WA applied are not known at the beginning; we use this field 3787 * to return the no of DWORDS written. 3788 * 3789 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3790 * so it adds NOOPs as padding to make it cacheline aligned. 3791 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3792 * makes a complete batch buffer. 3793 */ 3794 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3795 { 3796 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3797 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3798 3799 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3800 if (IS_BROADWELL(engine->i915)) 3801 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3802 3803 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3804 /* Actual scratch location is at 128 bytes offset */ 3805 batch = gen8_emit_pipe_control(batch, 3806 PIPE_CONTROL_FLUSH_L3 | 3807 PIPE_CONTROL_STORE_DATA_INDEX | 3808 PIPE_CONTROL_CS_STALL | 3809 PIPE_CONTROL_QW_WRITE, 3810 LRC_PPHWSP_SCRATCH_ADDR); 3811 3812 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3813 3814 /* Pad to end of cacheline */ 3815 while ((unsigned long)batch % CACHELINE_BYTES) 3816 *batch++ = MI_NOOP; 3817 3818 /* 3819 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3820 * execution depends on the length specified in terms of cache lines 3821 * in the register CTX_RCS_INDIRECT_CTX 3822 */ 3823 3824 return batch; 3825 } 3826 3827 struct lri { 3828 i915_reg_t reg; 3829 u32 value; 3830 }; 3831 3832 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3833 { 3834 GEM_BUG_ON(!count || count > 63); 3835 3836 *batch++ = MI_LOAD_REGISTER_IMM(count); 3837 do { 3838 *batch++ = i915_mmio_reg_offset(lri->reg); 3839 *batch++ = lri->value; 3840 } while (lri++, --count); 3841 *batch++ = MI_NOOP; 3842 3843 return batch; 3844 } 3845 3846 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3847 { 3848 static const struct lri lri[] = { 3849 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3850 { 3851 COMMON_SLICE_CHICKEN2, 3852 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3853 0), 3854 }, 3855 3856 /* BSpec: 11391 */ 3857 { 3858 FF_SLICE_CHICKEN, 3859 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3860 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3861 }, 3862 3863 /* BSpec: 11299 */ 3864 { 3865 _3D_CHICKEN3, 3866 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3867 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3868 } 3869 }; 3870 3871 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3872 3873 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3874 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3875 3876 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3877 batch = gen8_emit_pipe_control(batch, 3878 PIPE_CONTROL_FLUSH_L3 | 3879 PIPE_CONTROL_STORE_DATA_INDEX | 3880 PIPE_CONTROL_CS_STALL | 3881 PIPE_CONTROL_QW_WRITE, 3882 LRC_PPHWSP_SCRATCH_ADDR); 3883 3884 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3885 3886 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3887 if (HAS_POOLED_EU(engine->i915)) { 3888 /* 3889 * EU pool configuration is setup along with golden context 3890 * during context initialization. This value depends on 3891 * device type (2x6 or 3x6) and needs to be updated based 3892 * on which subslice is disabled especially for 2x6 3893 * devices, however it is safe to load default 3894 * configuration of 3x6 device instead of masking off 3895 * corresponding bits because HW ignores bits of a disabled 3896 * subslice and drops down to appropriate config. Please 3897 * see render_state_setup() in i915_gem_render_state.c for 3898 * possible configurations, to avoid duplication they are 3899 * not shown here again. 3900 */ 3901 *batch++ = GEN9_MEDIA_POOL_STATE; 3902 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3903 *batch++ = 0x00777000; 3904 *batch++ = 0; 3905 *batch++ = 0; 3906 *batch++ = 0; 3907 } 3908 3909 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3910 3911 /* Pad to end of cacheline */ 3912 while ((unsigned long)batch % CACHELINE_BYTES) 3913 *batch++ = MI_NOOP; 3914 3915 return batch; 3916 } 3917 3918 static u32 * 3919 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3920 { 3921 int i; 3922 3923 /* 3924 * WaPipeControlBefore3DStateSamplePattern: cnl 3925 * 3926 * Ensure the engine is idle prior to programming a 3927 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3928 */ 3929 batch = gen8_emit_pipe_control(batch, 3930 PIPE_CONTROL_CS_STALL, 3931 0); 3932 /* 3933 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3934 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3935 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3936 * confusing. Since gen8_emit_pipe_control() already advances the 3937 * batch by 6 dwords, we advance the other 10 here, completing a 3938 * cacheline. It's not clear if the workaround requires this padding 3939 * before other commands, or if it's just the regular padding we would 3940 * already have for the workaround bb, so leave it here for now. 
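 * (6 dwords of PIPE_CONTROL plus the 10 MI_NOOPs below make 16 dwords,
 * i.e. 64 bytes, exactly one cacheline of commands.)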
3941 */ 3942 for (i = 0; i < 10; i++) 3943 *batch++ = MI_NOOP; 3944 3945 /* Pad to end of cacheline */ 3946 while ((unsigned long)batch % CACHELINE_BYTES) 3947 *batch++ = MI_NOOP; 3948 3949 return batch; 3950 } 3951 3952 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3953 3954 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3955 { 3956 struct drm_i915_gem_object *obj; 3957 struct i915_vma *vma; 3958 int err; 3959 3960 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3961 if (IS_ERR(obj)) 3962 return PTR_ERR(obj); 3963 3964 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3965 if (IS_ERR(vma)) { 3966 err = PTR_ERR(vma); 3967 goto err; 3968 } 3969 3970 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3971 if (err) 3972 goto err; 3973 3974 engine->wa_ctx.vma = vma; 3975 return 0; 3976 3977 err: 3978 i915_gem_object_put(obj); 3979 return err; 3980 } 3981 3982 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3983 { 3984 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3985 } 3986 3987 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3988 3989 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3990 { 3991 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3992 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3993 &wa_ctx->per_ctx }; 3994 wa_bb_func_t wa_bb_fn[2]; 3995 void *batch, *batch_ptr; 3996 unsigned int i; 3997 int ret; 3998 3999 if (engine->class != RENDER_CLASS) 4000 return 0; 4001 4002 switch (INTEL_GEN(engine->i915)) { 4003 case 12: 4004 case 11: 4005 return 0; 4006 case 10: 4007 wa_bb_fn[0] = gen10_init_indirectctx_bb; 4008 wa_bb_fn[1] = NULL; 4009 break; 4010 case 9: 4011 wa_bb_fn[0] = gen9_init_indirectctx_bb; 4012 wa_bb_fn[1] = NULL; 4013 break; 4014 case 8: 4015 wa_bb_fn[0] = gen8_init_indirectctx_bb; 4016 wa_bb_fn[1] = NULL; 4017 break; 4018 default: 4019 MISSING_CASE(INTEL_GEN(engine->i915)); 4020 return 0; 4021 } 4022 4023 ret = lrc_setup_wa_ctx(engine); 4024 if (ret) { 4025 drm_dbg(&engine->i915->drm, 4026 "Failed to setup context WA page: %d\n", ret); 4027 return ret; 4028 } 4029 4030 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 4031 4032 /* 4033 * Emit the two workaround batch buffers, recording the offset from the 4034 * start of the workaround batch buffer object for each and their 4035 * respective sizes. 4036 */ 4037 batch_ptr = batch; 4038 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 4039 wa_bb[i]->offset = batch_ptr - batch; 4040 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 4041 CACHELINE_BYTES))) { 4042 ret = -EINVAL; 4043 break; 4044 } 4045 if (wa_bb_fn[i]) 4046 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 4047 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 4048 } 4049 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 4050 4051 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4052 __i915_gem_object_release_map(wa_ctx->vma->obj); 4053 if (ret) 4054 lrc_destroy_wa_ctx(engine); 4055 4056 return ret; 4057 } 4058 4059 static void reset_csb_pointers(struct intel_engine_cs *engine) 4060 { 4061 struct intel_engine_execlists * const execlists = &engine->execlists; 4062 const unsigned int reset_value = execlists->csb_size - 1; 4063 4064 ring_set_paused(engine, 0); 4065 4066 /* 4067 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4068 * Bludgeon them with a mmio update to be sure. 
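 *
 * The write below is a masked update: the upper 16 bits enable writes to
 * the pointer fields underneath, so both the CSB read and write pointers
 * are forced to csb_size - 1 in a single go.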
4069 */ 4070 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4071 0xffff << 16 | reset_value << 8 | reset_value); 4072 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4073 4074 /* 4075 * After a reset, the HW starts writing into CSB entry [0]. We 4076 * therefore have to set our HEAD pointer back one entry so that 4077 * the *first* entry we check is entry 0. To complicate this further, 4078 * as we don't wait for the first interrupt after reset, we have to 4079 * fake the HW write to point back to the last entry so that our 4080 * inline comparison of our cached head position against the last HW 4081 * write works even before the first interrupt. 4082 */ 4083 execlists->csb_head = reset_value; 4084 WRITE_ONCE(*execlists->csb_write, reset_value); 4085 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4086 4087 /* Check that the GPU does indeed update the CSB entries! */ 4088 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4089 invalidate_csb_entries(&execlists->csb_status[0], 4090 &execlists->csb_status[reset_value]); 4091 4092 /* Once more for luck and our trusty paranoia */ 4093 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4094 0xffff << 16 | reset_value << 8 | reset_value); 4095 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4096 4097 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4098 } 4099 4100 static void execlists_sanitize(struct intel_engine_cs *engine) 4101 { 4102 GEM_BUG_ON(execlists_active(&engine->execlists)); 4103 4104 /* 4105 * Poison residual state on resume, in case the suspend didn't! 4106 * 4107 * We have to assume that across suspend/resume (or other loss 4108 * of control) that the contents of our pinned buffers has been 4109 * lost, replaced by garbage. Since this doesn't always happen, 4110 * let's poison such state so that we more quickly spot when 4111 * we falsely assume it has been preserved. 4112 */ 4113 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4114 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4115 4116 reset_csb_pointers(engine); 4117 4118 /* 4119 * The kernel_context HWSP is stored in the status_page. As above, 4120 * that may be lost on resume/initialisation, and so we need to 4121 * reset the value in the HWSP. 4122 */ 4123 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4124 4125 /* And scrub the dirty cachelines for the HWSP */ 4126 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4127 } 4128 4129 static void enable_error_interrupt(struct intel_engine_cs *engine) 4130 { 4131 u32 status; 4132 4133 engine->execlists.error_interrupt = 0; 4134 ENGINE_WRITE(engine, RING_EMR, ~0u); 4135 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4136 4137 status = ENGINE_READ(engine, RING_ESR); 4138 if (unlikely(status)) { 4139 drm_err(&engine->i915->drm, 4140 "engine '%s' resumed still in error: %08x\n", 4141 engine->name, status); 4142 __intel_gt_reset(engine->gt, engine->mask); 4143 } 4144 4145 /* 4146 * On current gen8+, we have 2 signals to play with 4147 * 4148 * - I915_ERROR_INSTUCTION (bit 0) 4149 * 4150 * Generate an error if the command parser encounters an invalid 4151 * instruction 4152 * 4153 * This is a fatal error. 4154 * 4155 * - CP_PRIV (bit 2) 4156 * 4157 * Generate an error on privilege violation (where the CP replaces 4158 * the instruction with a no-op). This also fires for writes into 4159 * read-only scratch pages. 4160 * 4161 * This is a non-fatal error, parsing continues. 
4162 * 4163 * * there are a few others defined for odd HW that we do not use 4164 * 4165 * Since CP_PRIV fires for cases where we have chosen to ignore the 4166 * error (as the HW is validating and suppressing the mistakes), we 4167 * only unmask the instruction error bit. 4168 */ 4169 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4170 } 4171 4172 static void enable_execlists(struct intel_engine_cs *engine) 4173 { 4174 u32 mode; 4175 4176 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4177 4178 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4179 4180 if (INTEL_GEN(engine->i915) >= 11) 4181 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4182 else 4183 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4184 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4185 4186 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4187 4188 ENGINE_WRITE_FW(engine, 4189 RING_HWS_PGA, 4190 i915_ggtt_offset(engine->status_page.vma)); 4191 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4192 4193 enable_error_interrupt(engine); 4194 4195 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4196 } 4197 4198 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4199 { 4200 bool unexpected = false; 4201 4202 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4203 drm_dbg(&engine->i915->drm, 4204 "STOP_RING still set in RING_MI_MODE\n"); 4205 unexpected = true; 4206 } 4207 4208 return unexpected; 4209 } 4210 4211 static int execlists_resume(struct intel_engine_cs *engine) 4212 { 4213 intel_mocs_init_engine(engine); 4214 4215 intel_breadcrumbs_reset(engine->breadcrumbs); 4216 4217 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4218 struct drm_printer p = drm_debug_printer(__func__); 4219 4220 intel_engine_dump(engine, &p, NULL); 4221 } 4222 4223 enable_execlists(engine); 4224 4225 return 0; 4226 } 4227 4228 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4229 { 4230 struct intel_engine_execlists * const execlists = &engine->execlists; 4231 unsigned long flags; 4232 4233 ENGINE_TRACE(engine, "depth<-%d\n", 4234 atomic_read(&execlists->tasklet.count)); 4235 4236 /* 4237 * Prevent request submission to the hardware until we have 4238 * completed the reset in i915_gem_reset_finish(). If a request 4239 * is completed by one engine, it may then queue a request 4240 * to a second via its execlists->tasklet *just* as we are 4241 * calling engine->resume() and also writing the ELSP. 4242 * Turning off the execlists->tasklet until the reset is over 4243 * prevents the race. 4244 */ 4245 __tasklet_disable_sync_once(&execlists->tasklet); 4246 GEM_BUG_ON(!reset_in_progress(execlists)); 4247 4248 /* And flush any current direct submission. */ 4249 spin_lock_irqsave(&engine->active.lock, flags); 4250 spin_unlock_irqrestore(&engine->active.lock, flags); 4251 4252 /* 4253 * We stop engines, otherwise we might get failed reset and a 4254 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4255 * from system hang if batchbuffer is progressing when 4256 * the reset is issued, regardless of READY_TO_RESET ack. 4257 * Thus assume it is best to stop engines on all gens 4258 * where we have a gpu reset. 
4259 * 4260 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4261 * 4262 * FIXME: Wa for more modern gens needs to be validated 4263 */ 4264 ring_set_paused(engine, 1); 4265 intel_engine_stop_cs(engine); 4266 4267 engine->execlists.reset_ccid = active_ccid(engine); 4268 } 4269 4270 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4271 { 4272 int x; 4273 4274 x = lrc_ring_mi_mode(engine); 4275 if (x != -1) { 4276 regs[x + 1] &= ~STOP_RING; 4277 regs[x + 1] |= STOP_RING << 16; 4278 } 4279 } 4280 4281 static void __execlists_reset_reg_state(const struct intel_context *ce, 4282 const struct intel_engine_cs *engine) 4283 { 4284 u32 *regs = ce->lrc_reg_state; 4285 4286 __reset_stop_ring(regs, engine); 4287 } 4288 4289 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4290 { 4291 struct intel_engine_execlists * const execlists = &engine->execlists; 4292 struct intel_context *ce; 4293 struct i915_request *rq; 4294 u32 head; 4295 4296 mb(); /* paranoia: read the CSB pointers from after the reset */ 4297 clflush(execlists->csb_write); 4298 mb(); 4299 4300 process_csb(engine); /* drain preemption events */ 4301 4302 /* Following the reset, we need to reload the CSB read/write pointers */ 4303 reset_csb_pointers(engine); 4304 4305 /* 4306 * Save the currently executing context, even if we completed 4307 * its request, it was still running at the time of the 4308 * reset and will have been clobbered. 4309 */ 4310 rq = active_context(engine, engine->execlists.reset_ccid); 4311 if (!rq) 4312 goto unwind; 4313 4314 ce = rq->context; 4315 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4316 4317 if (i915_request_completed(rq)) { 4318 /* Idle context; tidy up the ring so we can restart afresh */ 4319 head = intel_ring_wrap(ce->ring, rq->tail); 4320 goto out_replay; 4321 } 4322 4323 /* We still have requests in-flight; the engine should be active */ 4324 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4325 4326 /* Context has requests still in-flight; it should not be idle! */ 4327 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4328 4329 rq = active_request(ce->timeline, rq); 4330 head = intel_ring_wrap(ce->ring, rq->head); 4331 GEM_BUG_ON(head == ce->ring->tail); 4332 4333 /* 4334 * If this request hasn't started yet, e.g. it is waiting on a 4335 * semaphore, we need to avoid skipping the request or else we 4336 * break the signaling chain. However, if the context is corrupt 4337 * the request will not restart and we will be stuck with a wedged 4338 * device. It is quite often the case that if we issue a reset 4339 * while the GPU is loading the context image, that the context 4340 * image becomes corrupt. 4341 * 4342 * Otherwise, if we have not started yet, the request should replay 4343 * perfectly and we do not need to flag the result as being erroneous. 4344 */ 4345 if (!i915_request_started(rq)) 4346 goto out_replay; 4347 4348 /* 4349 * If the request was innocent, we leave the request in the ELSP 4350 * and will try to replay it on restarting. The context image may 4351 * have been corrupted by the reset, in which case we may have 4352 * to service a new GPU hang, but more likely we can continue on 4353 * without impact. 4354 * 4355 * If the request was guilty, we presume the context is corrupt 4356 * and have to at least restore the RING register in the context 4357 * image back to the expected values to skip over the guilty request. 
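 *
 * __i915_request_reset() below applies that policy: roughly, a stalled
 * request is treated as guilty, flagged with -EIO and skipped, while an
 * innocent one is left untouched for replay.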
4358 */ 4359 __i915_request_reset(rq, stalled); 4360 4361 /* 4362 * We want a simple context + ring to execute the breadcrumb update. 4363 * We cannot rely on the context being intact across the GPU hang, 4364 * so clear it and rebuild just what we need for the breadcrumb. 4365 * All pending requests for this context will be zapped, and any 4366 * future request will be after userspace has had the opportunity 4367 * to recreate its own state. 4368 */ 4369 out_replay: 4370 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4371 head, ce->ring->tail); 4372 __execlists_reset_reg_state(ce, engine); 4373 __execlists_update_reg_state(ce, engine, head); 4374 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4375 4376 unwind: 4377 /* Push back any incomplete requests for replay after the reset. */ 4378 cancel_port_requests(execlists); 4379 __unwind_incomplete_requests(engine); 4380 } 4381 4382 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4383 { 4384 unsigned long flags; 4385 4386 ENGINE_TRACE(engine, "\n"); 4387 4388 spin_lock_irqsave(&engine->active.lock, flags); 4389 4390 __execlists_reset(engine, stalled); 4391 4392 spin_unlock_irqrestore(&engine->active.lock, flags); 4393 } 4394 4395 static void nop_submission_tasklet(unsigned long data) 4396 { 4397 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4398 4399 /* The driver is wedged; don't process any more events. */ 4400 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4401 } 4402 4403 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4404 { 4405 struct intel_engine_execlists * const execlists = &engine->execlists; 4406 struct i915_request *rq, *rn; 4407 struct rb_node *rb; 4408 unsigned long flags; 4409 4410 ENGINE_TRACE(engine, "\n"); 4411 4412 /* 4413 * Before we call engine->cancel_requests(), we should have exclusive 4414 * access to the submission state. This is arranged for us by the 4415 * caller disabling the interrupt generation, the tasklet and other 4416 * threads that may then access the same state, giving us a free hand 4417 * to reset state. However, we still need to let lockdep be aware that 4418 * we know this state may be accessed in hardirq context, so we 4419 * disable the irq around this manipulation and we want to keep 4420 * the spinlock focused on its duties and not accidentally conflate 4421 * coverage to the submission's irq state. (Similarly, although we 4422 * shouldn't need to disable irq around the manipulation of the 4423 * submission's irq state, we also wish to remind ourselves that 4424 * it is irq state.) 4425 */ 4426 spin_lock_irqsave(&engine->active.lock, flags); 4427 4428 __execlists_reset(engine, true); 4429 4430 /* Mark all executing requests as skipped. */ 4431 list_for_each_entry(rq, &engine->active.requests, sched.link) 4432 mark_eio(rq); 4433 intel_engine_signal_breadcrumbs(engine); 4434 4435 /* Flush the queued requests to the timeline list (for retiring). 
*/ 4436 while ((rb = rb_first_cached(&execlists->queue))) { 4437 struct i915_priolist *p = to_priolist(rb); 4438 int i; 4439 4440 priolist_for_each_request_consume(rq, rn, p, i) { 4441 mark_eio(rq); 4442 __i915_request_submit(rq); 4443 } 4444 4445 rb_erase_cached(&p->node, &execlists->queue); 4446 i915_priolist_free(p); 4447 } 4448 4449 /* On-hold requests will be flushed to timeline upon their release */ 4450 list_for_each_entry(rq, &engine->active.hold, sched.link) 4451 mark_eio(rq); 4452 4453 /* Cancel all attached virtual engines */ 4454 while ((rb = rb_first_cached(&execlists->virtual))) { 4455 struct virtual_engine *ve = 4456 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4457 4458 rb_erase_cached(rb, &execlists->virtual); 4459 RB_CLEAR_NODE(rb); 4460 4461 spin_lock(&ve->base.active.lock); 4462 rq = fetch_and_zero(&ve->request); 4463 if (rq) { 4464 mark_eio(rq); 4465 4466 rq->engine = engine; 4467 __i915_request_submit(rq); 4468 i915_request_put(rq); 4469 4470 ve->base.execlists.queue_priority_hint = INT_MIN; 4471 } 4472 spin_unlock(&ve->base.active.lock); 4473 } 4474 4475 /* Remaining _unready_ requests will be nop'ed when submitted */ 4476 4477 execlists->queue_priority_hint = INT_MIN; 4478 execlists->queue = RB_ROOT_CACHED; 4479 4480 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4481 execlists->tasklet.func = nop_submission_tasklet; 4482 4483 spin_unlock_irqrestore(&engine->active.lock, flags); 4484 } 4485 4486 static void execlists_reset_finish(struct intel_engine_cs *engine) 4487 { 4488 struct intel_engine_execlists * const execlists = &engine->execlists; 4489 4490 /* 4491 * After a GPU reset, we may have requests to replay. Do so now while 4492 * we still have the forcewake to be sure that the GPU is not allowed 4493 * to sleep before we restart and reload a context. 4494 */ 4495 GEM_BUG_ON(!reset_in_progress(execlists)); 4496 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4497 execlists->tasklet.func(execlists->tasklet.data); 4498 4499 if (__tasklet_enable(&execlists->tasklet)) 4500 /* And kick in case we missed a new request submission. */ 4501 tasklet_hi_schedule(&execlists->tasklet); 4502 ENGINE_TRACE(engine, "depth->%d\n", 4503 atomic_read(&execlists->tasklet.count)); 4504 } 4505 4506 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4507 u64 offset, u32 len, 4508 const unsigned int flags) 4509 { 4510 u32 *cs; 4511 4512 cs = intel_ring_begin(rq, 4); 4513 if (IS_ERR(cs)) 4514 return PTR_ERR(cs); 4515 4516 /* 4517 * WaDisableCtxRestoreArbitration:bdw,chv 4518 * 4519 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4520 * particular all the gen that do not need the w/a at all!), if we 4521 * took care to make sure that on every switch into this context 4522 * (both ordinary and for preemption) that arbitrartion was enabled 4523 * we would be fine. However, for gen8 there is another w/a that 4524 * requires us to not preempt inside GPGPU execution, so we keep 4525 * arbitration disabled for gen8 batches. Arbitration will be 4526 * re-enabled before we close the request 4527 * (engine->emit_fini_breadcrumb). 4528 */ 4529 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4530 4531 /* FIXME(BDW+): Address space and security selectors. */ 4532 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4533 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4534 *cs++ = lower_32_bits(offset); 4535 *cs++ = upper_32_bits(offset); 4536 4537 intel_ring_advance(rq, cs); 4538 4539 return 0; 4540 } 4541 4542 static int gen8_emit_bb_start(struct i915_request *rq, 4543 u64 offset, u32 len, 4544 const unsigned int flags) 4545 { 4546 u32 *cs; 4547 4548 cs = intel_ring_begin(rq, 6); 4549 if (IS_ERR(cs)) 4550 return PTR_ERR(cs); 4551 4552 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4553 4554 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4555 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4556 *cs++ = lower_32_bits(offset); 4557 *cs++ = upper_32_bits(offset); 4558 4559 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4560 *cs++ = MI_NOOP; 4561 4562 intel_ring_advance(rq, cs); 4563 4564 return 0; 4565 } 4566 4567 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4568 { 4569 ENGINE_WRITE(engine, RING_IMR, 4570 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4571 ENGINE_POSTING_READ(engine, RING_IMR); 4572 } 4573 4574 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4575 { 4576 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4577 } 4578 4579 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4580 { 4581 u32 cmd, *cs; 4582 4583 cs = intel_ring_begin(request, 4); 4584 if (IS_ERR(cs)) 4585 return PTR_ERR(cs); 4586 4587 cmd = MI_FLUSH_DW + 1; 4588 4589 /* We always require a command barrier so that subsequent 4590 * commands, such as breadcrumb interrupts, are strictly ordered 4591 * wrt the contents of the write cache being flushed to memory 4592 * (and thus being coherent from the CPU). 4593 */ 4594 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4595 4596 if (mode & EMIT_INVALIDATE) { 4597 cmd |= MI_INVALIDATE_TLB; 4598 if (request->engine->class == VIDEO_DECODE_CLASS) 4599 cmd |= MI_INVALIDATE_BSD; 4600 } 4601 4602 *cs++ = cmd; 4603 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4604 *cs++ = 0; /* upper addr */ 4605 *cs++ = 0; /* value */ 4606 intel_ring_advance(request, cs); 4607 4608 return 0; 4609 } 4610 4611 static int gen8_emit_flush_render(struct i915_request *request, 4612 u32 mode) 4613 { 4614 bool vf_flush_wa = false, dc_flush_wa = false; 4615 u32 *cs, flags = 0; 4616 int len; 4617 4618 flags |= PIPE_CONTROL_CS_STALL; 4619 4620 if (mode & EMIT_FLUSH) { 4621 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4622 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4623 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4624 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4625 } 4626 4627 if (mode & EMIT_INVALIDATE) { 4628 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4629 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4630 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4631 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4632 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4633 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4634 flags |= PIPE_CONTROL_QW_WRITE; 4635 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4636 4637 /* 4638 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4639 * pipe control. 
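 * ("NULL" meaning a PIPE_CONTROL with no flags set at all; see the
 * vf_flush_wa emission of gen8_emit_pipe_control(cs, 0, 0) further down.)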
4640 */ 4641 if (IS_GEN(request->engine->i915, 9)) 4642 vf_flush_wa = true; 4643 4644 /* WaForGAMHang:kbl */ 4645 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4646 dc_flush_wa = true; 4647 } 4648 4649 len = 6; 4650 4651 if (vf_flush_wa) 4652 len += 6; 4653 4654 if (dc_flush_wa) 4655 len += 12; 4656 4657 cs = intel_ring_begin(request, len); 4658 if (IS_ERR(cs)) 4659 return PTR_ERR(cs); 4660 4661 if (vf_flush_wa) 4662 cs = gen8_emit_pipe_control(cs, 0, 0); 4663 4664 if (dc_flush_wa) 4665 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4666 0); 4667 4668 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4669 4670 if (dc_flush_wa) 4671 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4672 4673 intel_ring_advance(request, cs); 4674 4675 return 0; 4676 } 4677 4678 static int gen11_emit_flush_render(struct i915_request *request, 4679 u32 mode) 4680 { 4681 if (mode & EMIT_FLUSH) { 4682 u32 *cs; 4683 u32 flags = 0; 4684 4685 flags |= PIPE_CONTROL_CS_STALL; 4686 4687 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4688 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4689 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4690 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4691 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4692 flags |= PIPE_CONTROL_QW_WRITE; 4693 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4694 4695 cs = intel_ring_begin(request, 6); 4696 if (IS_ERR(cs)) 4697 return PTR_ERR(cs); 4698 4699 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4700 intel_ring_advance(request, cs); 4701 } 4702 4703 if (mode & EMIT_INVALIDATE) { 4704 u32 *cs; 4705 u32 flags = 0; 4706 4707 flags |= PIPE_CONTROL_CS_STALL; 4708 4709 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4710 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4711 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4712 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4713 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4714 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4715 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4716 flags |= PIPE_CONTROL_QW_WRITE; 4717 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4718 4719 cs = intel_ring_begin(request, 6); 4720 if (IS_ERR(cs)) 4721 return PTR_ERR(cs); 4722 4723 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4724 intel_ring_advance(request, cs); 4725 } 4726 4727 return 0; 4728 } 4729 4730 static u32 preparser_disable(bool state) 4731 { 4732 return MI_ARB_CHECK | 1 << 8 | state; 4733 } 4734 4735 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4736 { 4737 static const i915_reg_t vd[] = { 4738 GEN12_VD0_AUX_NV, 4739 GEN12_VD1_AUX_NV, 4740 GEN12_VD2_AUX_NV, 4741 GEN12_VD3_AUX_NV, 4742 }; 4743 4744 static const i915_reg_t ve[] = { 4745 GEN12_VE0_AUX_NV, 4746 GEN12_VE1_AUX_NV, 4747 }; 4748 4749 if (engine->class == VIDEO_DECODE_CLASS) 4750 return vd[engine->instance]; 4751 4752 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4753 return ve[engine->instance]; 4754 4755 GEM_BUG_ON("unknown aux_inv_reg\n"); 4756 4757 return INVALID_MMIO_REG; 4758 } 4759 4760 static u32 * 4761 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4762 { 4763 *cs++ = MI_LOAD_REGISTER_IMM(1); 4764 *cs++ = i915_mmio_reg_offset(inv_reg); 4765 *cs++ = AUX_INV; 4766 *cs++ = MI_NOOP; 4767 4768 return cs; 4769 } 4770 4771 static int gen12_emit_flush_render(struct i915_request *request, 4772 u32 mode) 4773 { 4774 if (mode & EMIT_FLUSH) { 4775 u32 flags = 0; 4776 u32 *cs; 4777 4778 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4779 flags |= PIPE_CONTROL_FLUSH_L3; 4780 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4781 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4782 /* Wa_1409600907:tgl */ 4783 flags |= PIPE_CONTROL_DEPTH_STALL; 4784 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4785 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4786 4787 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4788 flags |= PIPE_CONTROL_QW_WRITE; 4789 4790 flags |= PIPE_CONTROL_CS_STALL; 4791 4792 cs = intel_ring_begin(request, 6); 4793 if (IS_ERR(cs)) 4794 return PTR_ERR(cs); 4795 4796 cs = gen12_emit_pipe_control(cs, 4797 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4798 flags, LRC_PPHWSP_SCRATCH_ADDR); 4799 intel_ring_advance(request, cs); 4800 } 4801 4802 if (mode & EMIT_INVALIDATE) { 4803 u32 flags = 0; 4804 u32 *cs; 4805 4806 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4807 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4808 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4809 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4810 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4811 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4812 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4813 4814 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4815 flags |= PIPE_CONTROL_QW_WRITE; 4816 4817 flags |= PIPE_CONTROL_CS_STALL; 4818 4819 cs = intel_ring_begin(request, 8 + 4); 4820 if (IS_ERR(cs)) 4821 return PTR_ERR(cs); 4822 4823 /* 4824 * Prevent the pre-parser from skipping past the TLB 4825 * invalidate and loading a stale page for the batch 4826 * buffer / request payload. 4827 */ 4828 *cs++ = preparser_disable(true); 4829 4830 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4831 4832 /* hsdes: 1809175790 */ 4833 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4834 4835 *cs++ = preparser_disable(false); 4836 intel_ring_advance(request, cs); 4837 } 4838 4839 return 0; 4840 } 4841 4842 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4843 { 4844 intel_engine_mask_t aux_inv = 0; 4845 u32 cmd, *cs; 4846 4847 cmd = 4; 4848 if (mode & EMIT_INVALIDATE) 4849 cmd += 2; 4850 if (mode & EMIT_INVALIDATE) 4851 aux_inv = request->engine->mask & ~BIT(BCS0); 4852 if (aux_inv) 4853 cmd += 2 * hweight8(aux_inv) + 2; 4854 4855 cs = intel_ring_begin(request, cmd); 4856 if (IS_ERR(cs)) 4857 return PTR_ERR(cs); 4858 4859 if (mode & EMIT_INVALIDATE) 4860 *cs++ = preparser_disable(true); 4861 4862 cmd = MI_FLUSH_DW + 1; 4863 4864 /* We always require a command barrier so that subsequent 4865 * commands, such as breadcrumb interrupts, are strictly ordered 4866 * wrt the contents of the write cache being flushed to memory 4867 * (and thus being coherent from the CPU). 
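 *
 * The STORE_INDEX/STOREDW post-sync operation below provides that
 * barrier by writing a dummy dword into the per-context ppHWSP scratch
 * slot (LRC_PPHWSP_SCRATCH_ADDR).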
4868 */ 4869 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4870 4871 if (mode & EMIT_INVALIDATE) { 4872 cmd |= MI_INVALIDATE_TLB; 4873 if (request->engine->class == VIDEO_DECODE_CLASS) 4874 cmd |= MI_INVALIDATE_BSD; 4875 } 4876 4877 *cs++ = cmd; 4878 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4879 *cs++ = 0; /* upper addr */ 4880 *cs++ = 0; /* value */ 4881 4882 if (aux_inv) { /* hsdes: 1809175790 */ 4883 struct intel_engine_cs *engine; 4884 unsigned int tmp; 4885 4886 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4887 for_each_engine_masked(engine, request->engine->gt, 4888 aux_inv, tmp) { 4889 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4890 *cs++ = AUX_INV; 4891 } 4892 *cs++ = MI_NOOP; 4893 } 4894 4895 if (mode & EMIT_INVALIDATE) 4896 *cs++ = preparser_disable(false); 4897 4898 intel_ring_advance(request, cs); 4899 4900 return 0; 4901 } 4902 4903 static void assert_request_valid(struct i915_request *rq) 4904 { 4905 struct intel_ring *ring __maybe_unused = rq->ring; 4906 4907 /* Can we unwind this request without appearing to go forwards? */ 4908 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4909 } 4910 4911 /* 4912 * Reserve space for 2 NOOPs at the end of each request to be 4913 * used as a workaround for not being allowed to do lite 4914 * restore with HEAD==TAIL (WaIdleLiteRestore). 4915 */ 4916 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4917 { 4918 /* Ensure there's always at least one preemption point per-request. */ 4919 *cs++ = MI_ARB_CHECK; 4920 *cs++ = MI_NOOP; 4921 request->wa_tail = intel_ring_offset(request, cs); 4922 4923 /* Check that entire request is less than half the ring */ 4924 assert_request_valid(request); 4925 4926 return cs; 4927 } 4928 4929 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4930 { 4931 *cs++ = MI_SEMAPHORE_WAIT | 4932 MI_SEMAPHORE_GLOBAL_GTT | 4933 MI_SEMAPHORE_POLL | 4934 MI_SEMAPHORE_SAD_EQ_SDD; 4935 *cs++ = 0; 4936 *cs++ = intel_hws_preempt_address(request->engine); 4937 *cs++ = 0; 4938 4939 return cs; 4940 } 4941 4942 static __always_inline u32* 4943 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4944 { 4945 *cs++ = MI_USER_INTERRUPT; 4946 4947 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4948 if (intel_engine_has_semaphores(request->engine)) 4949 cs = emit_preempt_busywait(request, cs); 4950 4951 request->tail = intel_ring_offset(request, cs); 4952 assert_ring_tail_valid(request->ring, request->tail); 4953 4954 return gen8_emit_wa_tail(request, cs); 4955 } 4956 4957 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) 4958 { 4959 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); 4960 } 4961 4962 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4963 { 4964 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4965 } 4966 4967 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4968 { 4969 cs = gen8_emit_pipe_control(cs, 4970 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4971 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4972 PIPE_CONTROL_DC_FLUSH_ENABLE, 4973 0); 4974 4975 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4976 cs = gen8_emit_ggtt_write_rcs(cs, 4977 request->fence.seqno, 4978 hwsp_offset(request), 4979 PIPE_CONTROL_FLUSH_ENABLE | 4980 PIPE_CONTROL_CS_STALL); 4981 4982 return gen8_emit_fini_breadcrumb_tail(request, cs); 4983 } 4984 4985 static u32 * 4986 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 
4987 { 4988 cs = gen8_emit_ggtt_write_rcs(cs, 4989 request->fence.seqno, 4990 hwsp_offset(request), 4991 PIPE_CONTROL_CS_STALL | 4992 PIPE_CONTROL_TILE_CACHE_FLUSH | 4993 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4994 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4995 PIPE_CONTROL_DC_FLUSH_ENABLE | 4996 PIPE_CONTROL_FLUSH_ENABLE); 4997 4998 return gen8_emit_fini_breadcrumb_tail(request, cs); 4999 } 5000 5001 /* 5002 * Note that the CS instruction pre-parser will not stall on the breadcrumb 5003 * flush and will continue pre-fetching the instructions after it before the 5004 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 5005 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 5006 * of the next request before the memory has been flushed, we're guaranteed that 5007 * we won't access the batch itself too early. 5008 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 5009 * so, if the current request is modifying an instruction in the next request on 5010 * the same intel_context, we might pre-fetch and then execute the pre-update 5011 * instruction. To avoid this, the users of self-modifying code should either 5012 * disable the parser around the code emitting the memory writes, via a new flag 5013 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 5014 * the in-kernel use-cases we've opted to use a separate context, see 5015 * reloc_gpu() as an example. 5016 * All the above applies only to the instructions themselves. Non-inline data 5017 * used by the instructions is not pre-fetched. 5018 */ 5019 5020 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 5021 { 5022 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 5023 MI_SEMAPHORE_GLOBAL_GTT | 5024 MI_SEMAPHORE_POLL | 5025 MI_SEMAPHORE_SAD_EQ_SDD; 5026 *cs++ = 0; 5027 *cs++ = intel_hws_preempt_address(request->engine); 5028 *cs++ = 0; 5029 *cs++ = 0; 5030 *cs++ = MI_NOOP; 5031 5032 return cs; 5033 } 5034 5035 static __always_inline u32* 5036 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 5037 { 5038 *cs++ = MI_USER_INTERRUPT; 5039 5040 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 5041 if (intel_engine_has_semaphores(request->engine)) 5042 cs = gen12_emit_preempt_busywait(request, cs); 5043 5044 request->tail = intel_ring_offset(request, cs); 5045 assert_ring_tail_valid(request->ring, request->tail); 5046 5047 return gen8_emit_wa_tail(request, cs); 5048 } 5049 5050 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 5051 { 5052 /* XXX Stalling flush before seqno write; post-sync not */ 5053 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); 5054 return gen12_emit_fini_breadcrumb_tail(rq, cs); 5055 } 5056 5057 static u32 * 5058 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5059 { 5060 cs = gen12_emit_ggtt_write_rcs(cs, 5061 request->fence.seqno, 5062 hwsp_offset(request), 5063 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5064 PIPE_CONTROL_CS_STALL | 5065 PIPE_CONTROL_TILE_CACHE_FLUSH | 5066 PIPE_CONTROL_FLUSH_L3 | 5067 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5068 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5069 /* Wa_1409600907:tgl */ 5070 PIPE_CONTROL_DEPTH_STALL | 5071 PIPE_CONTROL_DC_FLUSH_ENABLE | 5072 PIPE_CONTROL_FLUSH_ENABLE); 5073 5074 return gen12_emit_fini_breadcrumb_tail(request, cs); 5075 } 5076 5077 static void execlists_park(struct intel_engine_cs *engine) 5078 { 5079 cancel_timer(&engine->execlists.timer); 5080 cancel_timer(&engine->execlists.preempt); 5081 } 5082 
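/*
 * Wire an engine up to the execlists backend: install the submission and
 * scheduling entry points plus the submission tasklet, hook up the reset
 * (prepare/rewind/cancel/finish) and park callbacks, and advertise the
 * features the backend can use on this platform (stats, semaphores,
 * preemption and, where enabled, timeslicing).
 */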
5083 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 5084 { 5085 engine->submit_request = execlists_submit_request; 5086 engine->schedule = i915_schedule; 5087 engine->execlists.tasklet.func = execlists_submission_tasklet; 5088 5089 engine->reset.prepare = execlists_reset_prepare; 5090 engine->reset.rewind = execlists_reset_rewind; 5091 engine->reset.cancel = execlists_reset_cancel; 5092 engine->reset.finish = execlists_reset_finish; 5093 5094 engine->park = execlists_park; 5095 engine->unpark = NULL; 5096 5097 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5098 if (!intel_vgpu_active(engine->i915)) { 5099 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5100 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5101 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5102 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5103 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5104 } 5105 } 5106 5107 if (INTEL_GEN(engine->i915) >= 12) 5108 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5109 5110 if (intel_engine_has_preemption(engine)) 5111 engine->emit_bb_start = gen8_emit_bb_start; 5112 else 5113 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5114 } 5115 5116 static void execlists_shutdown(struct intel_engine_cs *engine) 5117 { 5118 /* Synchronise with residual timers and any softirq they raise */ 5119 del_timer_sync(&engine->execlists.timer); 5120 del_timer_sync(&engine->execlists.preempt); 5121 tasklet_kill(&engine->execlists.tasklet); 5122 } 5123 5124 static void execlists_release(struct intel_engine_cs *engine) 5125 { 5126 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5127 5128 execlists_shutdown(engine); 5129 5130 intel_engine_cleanup_common(engine); 5131 lrc_destroy_wa_ctx(engine); 5132 } 5133 5134 static void 5135 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5136 { 5137 /* Default vfuncs which can be overriden by each engine. */ 5138 5139 engine->resume = execlists_resume; 5140 5141 engine->cops = &execlists_context_ops; 5142 engine->request_alloc = execlists_request_alloc; 5143 5144 engine->emit_flush = gen8_emit_flush; 5145 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5146 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5147 if (INTEL_GEN(engine->i915) >= 12) { 5148 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5149 engine->emit_flush = gen12_emit_flush; 5150 } 5151 engine->set_default_submission = intel_execlists_set_default_submission; 5152 5153 if (INTEL_GEN(engine->i915) < 11) { 5154 engine->irq_enable = gen8_logical_ring_enable_irq; 5155 engine->irq_disable = gen8_logical_ring_disable_irq; 5156 } else { 5157 /* 5158 * TODO: On Gen11 interrupt masks need to be clear 5159 * to allow C6 entry. Keep interrupts enabled at 5160 * and take the hit of generating extra interrupts 5161 * until a more refined solution exists. 
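 * (Concretely, engine->irq_enable/irq_disable are simply left unset for
 * gen11+, so RING_IMR is never written from here.)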
5162 */ 5163 } 5164 } 5165 5166 static inline void 5167 logical_ring_default_irqs(struct intel_engine_cs *engine) 5168 { 5169 unsigned int shift = 0; 5170 5171 if (INTEL_GEN(engine->i915) < 11) { 5172 const u8 irq_shifts[] = { 5173 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5174 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5175 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5176 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5177 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5178 }; 5179 5180 shift = irq_shifts[engine->id]; 5181 } 5182 5183 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5184 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5185 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5186 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5187 } 5188 5189 static void rcs_submission_override(struct intel_engine_cs *engine) 5190 { 5191 switch (INTEL_GEN(engine->i915)) { 5192 case 12: 5193 engine->emit_flush = gen12_emit_flush_render; 5194 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5195 break; 5196 case 11: 5197 engine->emit_flush = gen11_emit_flush_render; 5198 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5199 break; 5200 default: 5201 engine->emit_flush = gen8_emit_flush_render; 5202 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5203 break; 5204 } 5205 } 5206 5207 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5208 { 5209 struct intel_engine_execlists * const execlists = &engine->execlists; 5210 struct drm_i915_private *i915 = engine->i915; 5211 struct intel_uncore *uncore = engine->uncore; 5212 u32 base = engine->mmio_base; 5213 5214 tasklet_init(&engine->execlists.tasklet, 5215 execlists_submission_tasklet, (unsigned long)engine); 5216 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5217 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5218 5219 logical_ring_default_vfuncs(engine); 5220 logical_ring_default_irqs(engine); 5221 5222 if (engine->class == RENDER_CLASS) 5223 rcs_submission_override(engine); 5224 5225 if (intel_init_workaround_bb(engine)) 5226 /* 5227 * We continue even if we fail to initialize WA batch 5228 * because we only expect rare glitches but nothing 5229 * critical to prevent us from using GPU 5230 */ 5231 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5232 5233 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5234 execlists->submit_reg = uncore->regs + 5235 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5236 execlists->ctrl_reg = uncore->regs + 5237 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5238 } else { 5239 execlists->submit_reg = uncore->regs + 5240 i915_mmio_reg_offset(RING_ELSP(base)); 5241 } 5242 5243 execlists->csb_status = 5244 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5245 5246 execlists->csb_write = 5247 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5248 5249 if (INTEL_GEN(i915) < 11) 5250 execlists->csb_size = GEN8_CSB_ENTRIES; 5251 else 5252 execlists->csb_size = GEN11_CSB_ENTRIES; 5253 5254 if (INTEL_GEN(engine->i915) >= 11) { 5255 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5256 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5257 } 5258 5259 /* Finally, take ownership and responsibility for cleanup! 
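 * Setting engine->sanitize and engine->release below lets the common
 * engine code call back into execlists on resume and on final teardown
 * respectively.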
*/ 5260 engine->sanitize = execlists_sanitize; 5261 engine->release = execlists_release; 5262 5263 return 0; 5264 } 5265 5266 static void init_common_reg_state(u32 * const regs, 5267 const struct intel_engine_cs *engine, 5268 const struct intel_ring *ring, 5269 bool inhibit) 5270 { 5271 u32 ctl; 5272 5273 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5274 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5275 if (inhibit) 5276 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5277 if (INTEL_GEN(engine->i915) < 11) 5278 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5279 CTX_CTRL_RS_CTX_ENABLE); 5280 regs[CTX_CONTEXT_CONTROL] = ctl; 5281 5282 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5283 regs[CTX_TIMESTAMP] = 0; 5284 } 5285 5286 static void init_wa_bb_reg_state(u32 * const regs, 5287 const struct intel_engine_cs *engine) 5288 { 5289 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5290 5291 if (wa_ctx->per_ctx.size) { 5292 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5293 5294 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5295 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5296 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5297 } 5298 5299 if (wa_ctx->indirect_ctx.size) { 5300 lrc_ring_setup_indirect_ctx(regs, engine, 5301 i915_ggtt_offset(wa_ctx->vma) + 5302 wa_ctx->indirect_ctx.offset, 5303 wa_ctx->indirect_ctx.size); 5304 } 5305 } 5306 5307 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5308 { 5309 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5310 /* 64b PPGTT (48bit canonical) 5311 * PDP0_DESCRIPTOR contains the base address to PML4 and 5312 * other PDP Descriptors are ignored. 5313 */ 5314 ASSIGN_CTX_PML4(ppgtt, regs); 5315 } else { 5316 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5317 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5318 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5319 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5320 } 5321 } 5322 5323 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5324 { 5325 if (i915_is_ggtt(vm)) 5326 return i915_vm_to_ggtt(vm)->alias; 5327 else 5328 return i915_vm_to_ppgtt(vm); 5329 } 5330 5331 static void execlists_init_reg_state(u32 *regs, 5332 const struct intel_context *ce, 5333 const struct intel_engine_cs *engine, 5334 const struct intel_ring *ring, 5335 bool inhibit) 5336 { 5337 /* 5338 * A context is actually a big batch buffer with several 5339 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5340 * values we are setting here are only for the first context restore: 5341 * on a subsequent save, the GPU will recreate this batchbuffer with new 5342 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5343 * we are not initializing here). 5344 * 5345 * Must keep consistent with virtual_update_register_offsets(). 
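 *
 * set_offsets() below lays down the MI_LOAD_REGISTER_IMM headers and the
 * register offsets; the init_*_reg_state() helpers then fill in the
 * initial values for the common, PPGTT and workaround-BB registers.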
5346 */ 5347 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5348 5349 init_common_reg_state(regs, engine, ring, inhibit); 5350 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5351 5352 init_wa_bb_reg_state(regs, engine); 5353 5354 __reset_stop_ring(regs, engine); 5355 } 5356 5357 static int 5358 populate_lr_context(struct intel_context *ce, 5359 struct drm_i915_gem_object *ctx_obj, 5360 struct intel_engine_cs *engine, 5361 struct intel_ring *ring) 5362 { 5363 bool inhibit = true; 5364 void *vaddr; 5365 5366 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5367 if (IS_ERR(vaddr)) { 5368 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5369 return PTR_ERR(vaddr); 5370 } 5371 5372 set_redzone(vaddr, engine); 5373 5374 if (engine->default_state) { 5375 shmem_read(engine->default_state, 0, 5376 vaddr, engine->context_size); 5377 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5378 inhibit = false; 5379 } 5380 5381 /* Clear the ppHWSP (inc. per-context counters) */ 5382 memset(vaddr, 0, PAGE_SIZE); 5383 5384 /* 5385 * The second page of the context object contains some registers which 5386 * must be set up prior to the first execution. 5387 */ 5388 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5389 ce, engine, ring, inhibit); 5390 5391 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5392 i915_gem_object_unpin_map(ctx_obj); 5393 return 0; 5394 } 5395 5396 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5397 { 5398 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5399 5400 return intel_timeline_create_from_engine(ce->engine, 5401 page_unmask_bits(tl)); 5402 } 5403 5404 static int __execlists_context_alloc(struct intel_context *ce, 5405 struct intel_engine_cs *engine) 5406 { 5407 struct drm_i915_gem_object *ctx_obj; 5408 struct intel_ring *ring; 5409 struct i915_vma *vma; 5410 u32 context_size; 5411 int ret; 5412 5413 GEM_BUG_ON(ce->state); 5414 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5415 5416 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5417 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5418 5419 if (INTEL_GEN(engine->i915) == 12) { 5420 ce->wa_bb_page = context_size / PAGE_SIZE; 5421 context_size += PAGE_SIZE; 5422 } 5423 5424 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5425 if (IS_ERR(ctx_obj)) 5426 return PTR_ERR(ctx_obj); 5427 5428 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5429 if (IS_ERR(vma)) { 5430 ret = PTR_ERR(vma); 5431 goto error_deref_obj; 5432 } 5433 5434 if (!page_mask_bits(ce->timeline)) { 5435 struct intel_timeline *tl; 5436 5437 /* 5438 * Use the static global HWSP for the kernel context, and 5439 * a dynamically allocated cacheline for everyone else. 
5440 */ 5441 if (unlikely(ce->timeline)) 5442 tl = pinned_timeline(ce); 5443 else 5444 tl = intel_timeline_create(engine->gt); 5445 if (IS_ERR(tl)) { 5446 ret = PTR_ERR(tl); 5447 goto error_deref_obj; 5448 } 5449 5450 ce->timeline = tl; 5451 } 5452 5453 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5454 if (IS_ERR(ring)) { 5455 ret = PTR_ERR(ring); 5456 goto error_deref_obj; 5457 } 5458 5459 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5460 if (ret) { 5461 drm_dbg(&engine->i915->drm, 5462 "Failed to populate LRC: %d\n", ret); 5463 goto error_ring_free; 5464 } 5465 5466 ce->ring = ring; 5467 ce->state = vma; 5468 5469 return 0; 5470 5471 error_ring_free: 5472 intel_ring_put(ring); 5473 error_deref_obj: 5474 i915_gem_object_put(ctx_obj); 5475 return ret; 5476 } 5477 5478 static struct list_head *virtual_queue(struct virtual_engine *ve) 5479 { 5480 return &ve->base.execlists.default_priolist.requests[0]; 5481 } 5482 5483 static void virtual_context_destroy(struct kref *kref) 5484 { 5485 struct virtual_engine *ve = 5486 container_of(kref, typeof(*ve), context.ref); 5487 unsigned int n; 5488 5489 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5490 GEM_BUG_ON(ve->request); 5491 GEM_BUG_ON(ve->context.inflight); 5492 5493 for (n = 0; n < ve->num_siblings; n++) { 5494 struct intel_engine_cs *sibling = ve->siblings[n]; 5495 struct rb_node *node = &ve->nodes[sibling->id].rb; 5496 unsigned long flags; 5497 5498 if (RB_EMPTY_NODE(node)) 5499 continue; 5500 5501 spin_lock_irqsave(&sibling->active.lock, flags); 5502 5503 /* Detachment is lazily performed in the execlists tasklet */ 5504 if (!RB_EMPTY_NODE(node)) 5505 rb_erase_cached(node, &sibling->execlists.virtual); 5506 5507 spin_unlock_irqrestore(&sibling->active.lock, flags); 5508 } 5509 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5510 5511 if (ve->context.state) 5512 __execlists_context_fini(&ve->context); 5513 intel_context_fini(&ve->context); 5514 5515 intel_engine_free_request_pool(&ve->base); 5516 5517 kfree(ve->bonds); 5518 kfree(ve); 5519 } 5520 5521 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5522 { 5523 int swp; 5524 5525 /* 5526 * Pick a random sibling on starting to help spread the load around. 5527 * 5528 * New contexts are typically created with exactly the same order 5529 * of siblings, and often started in batches. Due to the way we iterate 5530 * the array of sibling when submitting requests, sibling[0] is 5531 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5532 * randomised across the system, we also help spread the load by the 5533 * first engine we inspect being different each time. 5534 * 5535 * NB This does not force us to execute on this engine, it will just 5536 * typically be the first we inspect for submission. 
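 * (prandom_u32_max(n) returns a value in [0, n), so a result of 0 simply
 * leaves the existing sibling order in place.)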
static void virtual_context_destroy(struct kref *kref)
{
	struct virtual_engine *ve =
		container_of(kref, typeof(*ve), context.ref);
	unsigned int n;

	GEM_BUG_ON(!list_empty(virtual_queue(ve)));
	GEM_BUG_ON(ve->request);
	GEM_BUG_ON(ve->context.inflight);

	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = ve->siblings[n];
		struct rb_node *node = &ve->nodes[sibling->id].rb;
		unsigned long flags;

		if (RB_EMPTY_NODE(node))
			continue;

		spin_lock_irqsave(&sibling->active.lock, flags);

		/* Detachment is lazily performed in the execlists tasklet */
		if (!RB_EMPTY_NODE(node))
			rb_erase_cached(node, &sibling->execlists.virtual);

		spin_unlock_irqrestore(&sibling->active.lock, flags);
	}
	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));

	if (ve->context.state)
		__execlists_context_fini(&ve->context);
	intel_context_fini(&ve->context);

	intel_engine_free_request_pool(&ve->base);

	kfree(ve->bonds);
	kfree(ve);
}

static void virtual_engine_initial_hint(struct virtual_engine *ve)
{
	int swp;

	/*
	 * Pick a random sibling on starting to help spread the load around.
	 *
	 * New contexts are typically created with exactly the same order
	 * of siblings, and often started in batches. Due to the way we iterate
	 * the array of siblings when submitting requests, sibling[0] is
	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
	 * randomised across the system, we also help spread the load by the
	 * first engine we inspect being different each time.
	 *
	 * NB This does not force us to execute on this engine, it will just
	 * typically be the first we inspect for submission.
	 */
	swp = prandom_u32_max(ve->num_siblings);
	if (swp)
		swap(ve->siblings[swp], ve->siblings[0]);
}

static int virtual_context_alloc(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);

	return __execlists_context_alloc(ce, ve->siblings[0]);
}

static int virtual_context_pin(struct intel_context *ce, void *vaddr)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);

	/* Note: we must use a real engine class for setting up reg state */
	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
}

static void virtual_context_enter(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_get(ve->siblings[n]);

	intel_timeline_enter(ce->timeline);
}

static void virtual_context_exit(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	intel_timeline_exit(ce->timeline);

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_put(ve->siblings[n]);
}

static const struct intel_context_ops virtual_context_ops = {
	.alloc = virtual_context_alloc,

	.pre_pin = execlists_context_pre_pin,
	.pin = virtual_context_pin,
	.unpin = execlists_context_unpin,
	.post_unpin = execlists_context_post_unpin,

	.enter = virtual_context_enter,
	.exit = virtual_context_exit,

	.destroy = virtual_context_destroy,
};

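/*
 * Compute the set of physical engines on which the currently queued
 * virtual request may be run, or 0 if nothing is pending. A request
 * left with an empty execution_mask is flagged with -ENODEV and falls
 * back to siblings[0] so that it is still submitted, albeit in error.
 * The submission tasklet below uses this mask to decide which siblings
 * are offered the request.
 */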
static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{
	struct i915_request *rq;
	intel_engine_mask_t mask;

	rq = READ_ONCE(ve->request);
	if (!rq)
		return 0;

	/* The rq is ready for submission; rq->execution_mask is now stable. */
	mask = rq->execution_mask;
	if (unlikely(!mask)) {
		/* Invalid selection, submit to a random engine in error */
		i915_request_set_error_once(rq, -ENODEV);
		mask = ve->siblings[0]->mask;
	}

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
		     rq->fence.context, rq->fence.seqno,
		     mask, ve->base.execlists.queue_priority_hint);

	return mask;
}

static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (!READ_ONCE(ve->request))
			break; /* already handled by a sibling's tasklet */

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint)
			tasklet_hi_schedule(&sibling->execlists.tasklet);

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}

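/*
 * A virtual engine carries at most one request awaiting submission at
 * a time (ve->request). An already-completed request is submitted
 * directly; otherwise the request is parked on the virtual queue and
 * the virtual submission tasklet is kicked to offer it to the
 * siblings. Any old, already-completed request left behind by
 * preempt-to-busy is submitted and released first.
 */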
static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
		     rq->fence.context,
		     rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}

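/**
 * intel_execlists_create_virtual - create a load-balancing virtual engine
 * @siblings: array of physical engines to balance across
 * @count: number of entries in @siblings
 *
 * Create a context backed by a virtual engine whose requests may be run
 * on any of the @siblings, selected at submission time by the execlists
 * backend. With @count == 1 this degenerates into an ordinary context
 * on the single engine supplied.
 *
 * An illustrative caller sketch (not the actual uAPI plumbing):
 *
 *	ce = intel_execlists_create_virtual(siblings, count);
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *	...
 *	intel_context_put(ce);
 *
 * Return: the new context, or an ERR_PTR() on failure.
 */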
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

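	/*
	 * Absorb the siblings: each must be a distinct physical engine
	 * driven by the execlists submission backend and, as checked
	 * below, all siblings must share a single engine class.
	 */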
	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

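/**
 * intel_execlists_show_requests - dump the requests known to an engine
 * @engine: the engine to report on
 * @m: the drm_printer receiving the output
 * @show_request: callback used to print a single request
 * @max: maximum number of requests to print per category
 *
 * Under the engine lock, walk the executing, queued and virtual
 * requests tracked for @engine and print up to @max of each, eliding
 * the middle of overly long lists.
 */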
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif