/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * contexts is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
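 *
 * (As an aside, the third kind of context in the list above is created on
 * behalf of userspace; an illustrative, deliberately minimal userspace
 * snippet -- not part of this driver -- would be::
 *
 *	struct drm_i915_gem_context_create arg = {};
 *
 *	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
 *
 * after which arg.ctx_id names the new context on the open DRM fd.)
 *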
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use them. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
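 *
 * For illustration only, the pairing rule described above (discard requests
 * from the same context at the head of the queue, then submit the first two
 * unique contexts) can be sketched roughly as follows, where queue_head(),
 * queue_next() and elsp_submit() are hypothetical helpers named just for
 * this sketch rather than functions in this driver::
 *
 *	struct i915_request *first = queue_head(queue);
 *	struct i915_request *second = queue_next(queue, first);
 *
 *	while (second && second->context == first->context) {
 *		first = second;
 *		second = queue_next(queue, second);
 *	}
 *
 *	elsp_submit(engine, first, second);
 *
 * with elsp_submit() writing a NULL second context when only one request
 * remains in the queue.
 *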
 * That second-context request will then be resubmitted along with a new
 * request for a different context, which will cause the hardware to continue
 * executing the second request and queue the new request (the GPU detects the
 * condition of a context getting preempted with the same context and
 * optimizes the context switch flow by not doing preemption, but just
 * sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;
	struct rcu_work rcu;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
213 */ 214 struct ve_bond { 215 const struct intel_engine_cs *master; 216 intel_engine_mask_t sibling_mask; 217 } *bonds; 218 unsigned int num_bonds; 219 220 /* And finally, which physical engines this virtual engine maps onto. */ 221 unsigned int num_siblings; 222 struct intel_engine_cs *siblings[]; 223 }; 224 225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 226 { 227 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 228 return container_of(engine, struct virtual_engine, base); 229 } 230 231 static int __execlists_context_alloc(struct intel_context *ce, 232 struct intel_engine_cs *engine); 233 234 static void execlists_init_reg_state(u32 *reg_state, 235 const struct intel_context *ce, 236 const struct intel_engine_cs *engine, 237 const struct intel_ring *ring, 238 bool close); 239 static void 240 __execlists_update_reg_state(const struct intel_context *ce, 241 const struct intel_engine_cs *engine, 242 u32 head); 243 244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 245 { 246 if (INTEL_GEN(engine->i915) >= 12) 247 return 0x60; 248 else if (INTEL_GEN(engine->i915) >= 9) 249 return 0x54; 250 else if (engine->class == RENDER_CLASS) 251 return 0x58; 252 else 253 return -1; 254 } 255 256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 257 { 258 if (INTEL_GEN(engine->i915) >= 12) 259 return 0x74; 260 else if (INTEL_GEN(engine->i915) >= 9) 261 return 0x68; 262 else if (engine->class == RENDER_CLASS) 263 return 0xd8; 264 else 265 return -1; 266 } 267 268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 269 { 270 if (INTEL_GEN(engine->i915) >= 12) 271 return 0x12; 272 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 273 return 0x18; 274 else 275 return -1; 276 } 277 278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 279 { 280 int x; 281 282 x = lrc_ring_wa_bb_per_ctx(engine); 283 if (x < 0) 284 return x; 285 286 return x + 2; 287 } 288 289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 290 { 291 int x; 292 293 x = lrc_ring_indirect_ptr(engine); 294 if (x < 0) 295 return x; 296 297 return x + 2; 298 } 299 300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 301 { 302 if (engine->class != RENDER_CLASS) 303 return -1; 304 305 if (INTEL_GEN(engine->i915) >= 12) 306 return 0xb6; 307 else if (INTEL_GEN(engine->i915) >= 11) 308 return 0xaa; 309 else 310 return -1; 311 } 312 313 static u32 314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 315 { 316 switch (INTEL_GEN(engine->i915)) { 317 default: 318 MISSING_CASE(INTEL_GEN(engine->i915)); 319 fallthrough; 320 case 12: 321 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 322 case 11: 323 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 324 case 10: 325 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 326 case 9: 327 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 328 case 8: 329 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 330 } 331 } 332 333 static void 334 lrc_ring_setup_indirect_ctx(u32 *regs, 335 const struct intel_engine_cs *engine, 336 u32 ctx_bb_ggtt_addr, 337 u32 size) 338 { 339 GEM_BUG_ON(!size); 340 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 341 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 342 regs[lrc_ring_indirect_ptr(engine) + 1] = 343 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 344 345 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 346 regs[lrc_ring_indirect_offset(engine) + 1] = 347 
lrc_ring_indirect_offset_default(engine) << 6; 348 } 349 350 static u32 intel_context_get_runtime(const struct intel_context *ce) 351 { 352 /* 353 * We can use either ppHWSP[16] which is recorded before the context 354 * switch (and so excludes the cost of context switches) or use the 355 * value from the context image itself, which is saved/restored earlier 356 * and so includes the cost of the save. 357 */ 358 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 359 } 360 361 static void mark_eio(struct i915_request *rq) 362 { 363 if (i915_request_completed(rq)) 364 return; 365 366 GEM_BUG_ON(i915_request_signaled(rq)); 367 368 i915_request_set_error_once(rq, -EIO); 369 i915_request_mark_complete(rq); 370 } 371 372 static struct i915_request * 373 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 374 { 375 struct i915_request *active = rq; 376 377 rcu_read_lock(); 378 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 379 if (i915_request_completed(rq)) 380 break; 381 382 active = rq; 383 } 384 rcu_read_unlock(); 385 386 return active; 387 } 388 389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 390 { 391 return (i915_ggtt_offset(engine->status_page.vma) + 392 I915_GEM_HWS_PREEMPT_ADDR); 393 } 394 395 static inline void 396 ring_set_paused(const struct intel_engine_cs *engine, int state) 397 { 398 /* 399 * We inspect HWS_PREEMPT with a semaphore inside 400 * engine->emit_fini_breadcrumb. If the dword is true, 401 * the ring is paused as the semaphore will busywait 402 * until the dword is false. 403 */ 404 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 405 if (state) 406 wmb(); 407 } 408 409 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 410 { 411 return rb_entry(rb, struct i915_priolist, node); 412 } 413 414 static inline int rq_prio(const struct i915_request *rq) 415 { 416 return READ_ONCE(rq->sched.attr.priority); 417 } 418 419 static int effective_prio(const struct i915_request *rq) 420 { 421 int prio = rq_prio(rq); 422 423 /* 424 * If this request is special and must not be interrupted at any 425 * cost, so be it. Note we are only checking the most recent request 426 * in the context and so may be masking an earlier vip request. It 427 * is hoped that under the conditions where nopreempt is used, this 428 * will not matter (i.e. all requests to that context will be 429 * nopreempt for as long as desired). 430 */ 431 if (i915_request_has_nopreempt(rq)) 432 prio = I915_PRIORITY_UNPREEMPTABLE; 433 434 return prio; 435 } 436 437 static int queue_prio(const struct intel_engine_execlists *execlists) 438 { 439 struct i915_priolist *p; 440 struct rb_node *rb; 441 442 rb = rb_first_cached(&execlists->queue); 443 if (!rb) 444 return INT_MIN; 445 446 /* 447 * As the priolist[] are inverted, with the highest priority in [0], 448 * we have to flip the index value to become priority. 449 */ 450 p = to_priolist(rb); 451 if (!I915_USER_PRIORITY_SHIFT) 452 return p->priority; 453 454 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 455 } 456 457 static inline bool need_preempt(const struct intel_engine_cs *engine, 458 const struct i915_request *rq, 459 struct rb_node *rb) 460 { 461 int last_prio; 462 463 if (!intel_engine_has_semaphores(engine)) 464 return false; 465 466 /* 467 * Check if the current priority hint merits a preemption attempt. 
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue cannot be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
550 * 551 * This is what a descriptor looks like, from LSB to MSB:: 552 * 553 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 554 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 555 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 556 * bits 53-54: mbz, reserved for use by hardware 557 * bits 55-63: group ID, currently unused and set to 0 558 * 559 * Starting from Gen11, the upper dword of the descriptor has a new format: 560 * 561 * bits 32-36: reserved 562 * bits 37-47: SW context ID 563 * bits 48:53: engine instance 564 * bit 54: mbz, reserved for use by hardware 565 * bits 55-60: SW counter 566 * bits 61-63: engine class 567 * 568 * engine info, SW context ID and SW counter need to form a unique number 569 * (Context ID) per lrc. 570 */ 571 static u32 572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 573 { 574 u32 desc; 575 576 desc = INTEL_LEGACY_32B_CONTEXT; 577 if (i915_vm_is_4lvl(ce->vm)) 578 desc = INTEL_LEGACY_64B_CONTEXT; 579 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 580 581 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 582 if (IS_GEN(engine->i915, 8)) 583 desc |= GEN8_CTX_L3LLC_COHERENT; 584 585 return i915_ggtt_offset(ce->state) | desc; 586 } 587 588 static inline unsigned int dword_in_page(void *addr) 589 { 590 return offset_in_page(addr) / sizeof(u32); 591 } 592 593 static void set_offsets(u32 *regs, 594 const u8 *data, 595 const struct intel_engine_cs *engine, 596 bool clear) 597 #define NOP(x) (BIT(7) | (x)) 598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 599 #define POSTED BIT(0) 600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 601 #define REG16(x) \ 602 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 603 (((x) >> 2) & 0x7f) 604 #define END(total_state_size) 0, (total_state_size) 605 { 606 const u32 base = engine->mmio_base; 607 608 while (*data) { 609 u8 count, flags; 610 611 if (*data & BIT(7)) { /* skip */ 612 count = *data++ & ~BIT(7); 613 if (clear) 614 memset32(regs, MI_NOOP, count); 615 regs += count; 616 continue; 617 } 618 619 count = *data & 0x3f; 620 flags = *data >> 6; 621 data++; 622 623 *regs = MI_LOAD_REGISTER_IMM(count); 624 if (flags & POSTED) 625 *regs |= MI_LRI_FORCE_POSTED; 626 if (INTEL_GEN(engine->i915) >= 11) 627 *regs |= MI_LRI_LRM_CS_MMIO; 628 regs++; 629 630 GEM_BUG_ON(!count); 631 do { 632 u32 offset = 0; 633 u8 v; 634 635 do { 636 v = *data++; 637 offset <<= 7; 638 offset |= v & ~BIT(7); 639 } while (v & BIT(7)); 640 641 regs[0] = base + (offset << 2); 642 if (clear) 643 regs[1] = 0; 644 regs += 2; 645 } while (--count); 646 } 647 648 if (clear) { 649 u8 count = *++data; 650 651 /* Clear past the tail for HW access */ 652 GEM_BUG_ON(dword_in_page(regs) > count); 653 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 654 655 /* Close the batch; used mainly by live_lrc_layout() */ 656 *regs = MI_BATCH_BUFFER_END; 657 if (INTEL_GEN(engine->i915) >= 10) 658 *regs |= BIT(0); 659 } 660 } 661 662 static const u8 gen8_xcs_offsets[] = { 663 NOP(1), 664 LRI(11, 0), 665 REG16(0x244), 666 REG(0x034), 667 REG(0x030), 668 REG(0x038), 669 REG(0x03c), 670 REG(0x168), 671 REG(0x140), 672 REG(0x110), 673 REG(0x11c), 674 REG(0x114), 675 REG(0x118), 676 677 NOP(9), 678 LRI(9, 0), 679 REG16(0x3a8), 680 REG16(0x28c), 681 REG16(0x288), 682 REG16(0x284), 683 REG16(0x280), 684 REG16(0x27c), 685 REG16(0x278), 686 REG16(0x274), 687 REG16(0x270), 688 689 NOP(13), 690 LRI(2, 0), 691 REG16(0x200), 692 REG(0x028), 693 694 END(80) 
695 }; 696 697 static const u8 gen9_xcs_offsets[] = { 698 NOP(1), 699 LRI(14, POSTED), 700 REG16(0x244), 701 REG(0x034), 702 REG(0x030), 703 REG(0x038), 704 REG(0x03c), 705 REG(0x168), 706 REG(0x140), 707 REG(0x110), 708 REG(0x11c), 709 REG(0x114), 710 REG(0x118), 711 REG(0x1c0), 712 REG(0x1c4), 713 REG(0x1c8), 714 715 NOP(3), 716 LRI(9, POSTED), 717 REG16(0x3a8), 718 REG16(0x28c), 719 REG16(0x288), 720 REG16(0x284), 721 REG16(0x280), 722 REG16(0x27c), 723 REG16(0x278), 724 REG16(0x274), 725 REG16(0x270), 726 727 NOP(13), 728 LRI(1, POSTED), 729 REG16(0x200), 730 731 NOP(13), 732 LRI(44, POSTED), 733 REG(0x028), 734 REG(0x09c), 735 REG(0x0c0), 736 REG(0x178), 737 REG(0x17c), 738 REG16(0x358), 739 REG(0x170), 740 REG(0x150), 741 REG(0x154), 742 REG(0x158), 743 REG16(0x41c), 744 REG16(0x600), 745 REG16(0x604), 746 REG16(0x608), 747 REG16(0x60c), 748 REG16(0x610), 749 REG16(0x614), 750 REG16(0x618), 751 REG16(0x61c), 752 REG16(0x620), 753 REG16(0x624), 754 REG16(0x628), 755 REG16(0x62c), 756 REG16(0x630), 757 REG16(0x634), 758 REG16(0x638), 759 REG16(0x63c), 760 REG16(0x640), 761 REG16(0x644), 762 REG16(0x648), 763 REG16(0x64c), 764 REG16(0x650), 765 REG16(0x654), 766 REG16(0x658), 767 REG16(0x65c), 768 REG16(0x660), 769 REG16(0x664), 770 REG16(0x668), 771 REG16(0x66c), 772 REG16(0x670), 773 REG16(0x674), 774 REG16(0x678), 775 REG16(0x67c), 776 REG(0x068), 777 778 END(176) 779 }; 780 781 static const u8 gen12_xcs_offsets[] = { 782 NOP(1), 783 LRI(13, POSTED), 784 REG16(0x244), 785 REG(0x034), 786 REG(0x030), 787 REG(0x038), 788 REG(0x03c), 789 REG(0x168), 790 REG(0x140), 791 REG(0x110), 792 REG(0x1c0), 793 REG(0x1c4), 794 REG(0x1c8), 795 REG(0x180), 796 REG16(0x2b4), 797 798 NOP(5), 799 LRI(9, POSTED), 800 REG16(0x3a8), 801 REG16(0x28c), 802 REG16(0x288), 803 REG16(0x284), 804 REG16(0x280), 805 REG16(0x27c), 806 REG16(0x278), 807 REG16(0x274), 808 REG16(0x270), 809 810 END(80) 811 }; 812 813 static const u8 gen8_rcs_offsets[] = { 814 NOP(1), 815 LRI(14, POSTED), 816 REG16(0x244), 817 REG(0x034), 818 REG(0x030), 819 REG(0x038), 820 REG(0x03c), 821 REG(0x168), 822 REG(0x140), 823 REG(0x110), 824 REG(0x11c), 825 REG(0x114), 826 REG(0x118), 827 REG(0x1c0), 828 REG(0x1c4), 829 REG(0x1c8), 830 831 NOP(3), 832 LRI(9, POSTED), 833 REG16(0x3a8), 834 REG16(0x28c), 835 REG16(0x288), 836 REG16(0x284), 837 REG16(0x280), 838 REG16(0x27c), 839 REG16(0x278), 840 REG16(0x274), 841 REG16(0x270), 842 843 NOP(13), 844 LRI(1, 0), 845 REG(0x0c8), 846 847 END(80) 848 }; 849 850 static const u8 gen9_rcs_offsets[] = { 851 NOP(1), 852 LRI(14, POSTED), 853 REG16(0x244), 854 REG(0x34), 855 REG(0x30), 856 REG(0x38), 857 REG(0x3c), 858 REG(0x168), 859 REG(0x140), 860 REG(0x110), 861 REG(0x11c), 862 REG(0x114), 863 REG(0x118), 864 REG(0x1c0), 865 REG(0x1c4), 866 REG(0x1c8), 867 868 NOP(3), 869 LRI(9, POSTED), 870 REG16(0x3a8), 871 REG16(0x28c), 872 REG16(0x288), 873 REG16(0x284), 874 REG16(0x280), 875 REG16(0x27c), 876 REG16(0x278), 877 REG16(0x274), 878 REG16(0x270), 879 880 NOP(13), 881 LRI(1, 0), 882 REG(0xc8), 883 884 NOP(13), 885 LRI(44, POSTED), 886 REG(0x28), 887 REG(0x9c), 888 REG(0xc0), 889 REG(0x178), 890 REG(0x17c), 891 REG16(0x358), 892 REG(0x170), 893 REG(0x150), 894 REG(0x154), 895 REG(0x158), 896 REG16(0x41c), 897 REG16(0x600), 898 REG16(0x604), 899 REG16(0x608), 900 REG16(0x60c), 901 REG16(0x610), 902 REG16(0x614), 903 REG16(0x618), 904 REG16(0x61c), 905 REG16(0x620), 906 REG16(0x624), 907 REG16(0x628), 908 REG16(0x62c), 909 REG16(0x630), 910 REG16(0x634), 911 REG16(0x638), 912 REG16(0x63c), 913 
REG16(0x640), 914 REG16(0x644), 915 REG16(0x648), 916 REG16(0x64c), 917 REG16(0x650), 918 REG16(0x654), 919 REG16(0x658), 920 REG16(0x65c), 921 REG16(0x660), 922 REG16(0x664), 923 REG16(0x668), 924 REG16(0x66c), 925 REG16(0x670), 926 REG16(0x674), 927 REG16(0x678), 928 REG16(0x67c), 929 REG(0x68), 930 931 END(176) 932 }; 933 934 static const u8 gen11_rcs_offsets[] = { 935 NOP(1), 936 LRI(15, POSTED), 937 REG16(0x244), 938 REG(0x034), 939 REG(0x030), 940 REG(0x038), 941 REG(0x03c), 942 REG(0x168), 943 REG(0x140), 944 REG(0x110), 945 REG(0x11c), 946 REG(0x114), 947 REG(0x118), 948 REG(0x1c0), 949 REG(0x1c4), 950 REG(0x1c8), 951 REG(0x180), 952 953 NOP(1), 954 LRI(9, POSTED), 955 REG16(0x3a8), 956 REG16(0x28c), 957 REG16(0x288), 958 REG16(0x284), 959 REG16(0x280), 960 REG16(0x27c), 961 REG16(0x278), 962 REG16(0x274), 963 REG16(0x270), 964 965 LRI(1, POSTED), 966 REG(0x1b0), 967 968 NOP(10), 969 LRI(1, 0), 970 REG(0x0c8), 971 972 END(80) 973 }; 974 975 static const u8 gen12_rcs_offsets[] = { 976 NOP(1), 977 LRI(13, POSTED), 978 REG16(0x244), 979 REG(0x034), 980 REG(0x030), 981 REG(0x038), 982 REG(0x03c), 983 REG(0x168), 984 REG(0x140), 985 REG(0x110), 986 REG(0x1c0), 987 REG(0x1c4), 988 REG(0x1c8), 989 REG(0x180), 990 REG16(0x2b4), 991 992 NOP(5), 993 LRI(9, POSTED), 994 REG16(0x3a8), 995 REG16(0x28c), 996 REG16(0x288), 997 REG16(0x284), 998 REG16(0x280), 999 REG16(0x27c), 1000 REG16(0x278), 1001 REG16(0x274), 1002 REG16(0x270), 1003 1004 LRI(3, POSTED), 1005 REG(0x1b0), 1006 REG16(0x5a8), 1007 REG16(0x5ac), 1008 1009 NOP(6), 1010 LRI(1, 0), 1011 REG(0x0c8), 1012 NOP(3 + 9 + 1), 1013 1014 LRI(51, POSTED), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG16(0x588), 1021 REG(0x028), 1022 REG(0x09c), 1023 REG(0x0c0), 1024 REG(0x178), 1025 REG(0x17c), 1026 REG16(0x358), 1027 REG(0x170), 1028 REG(0x150), 1029 REG(0x154), 1030 REG(0x158), 1031 REG16(0x41c), 1032 REG16(0x600), 1033 REG16(0x604), 1034 REG16(0x608), 1035 REG16(0x60c), 1036 REG16(0x610), 1037 REG16(0x614), 1038 REG16(0x618), 1039 REG16(0x61c), 1040 REG16(0x620), 1041 REG16(0x624), 1042 REG16(0x628), 1043 REG16(0x62c), 1044 REG16(0x630), 1045 REG16(0x634), 1046 REG16(0x638), 1047 REG16(0x63c), 1048 REG16(0x640), 1049 REG16(0x644), 1050 REG16(0x648), 1051 REG16(0x64c), 1052 REG16(0x650), 1053 REG16(0x654), 1054 REG16(0x658), 1055 REG16(0x65c), 1056 REG16(0x660), 1057 REG16(0x664), 1058 REG16(0x668), 1059 REG16(0x66c), 1060 REG16(0x670), 1061 REG16(0x674), 1062 REG16(0x678), 1063 REG16(0x67c), 1064 REG(0x068), 1065 REG(0x084), 1066 NOP(1), 1067 1068 END(192) 1069 }; 1070 1071 #undef END 1072 #undef REG16 1073 #undef REG 1074 #undef LRI 1075 #undef NOP 1076 1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1078 { 1079 /* 1080 * The gen12+ lists only have the registers we program in the basic 1081 * default state. We rely on the context image using relative 1082 * addressing to automatic fixup the register state between the 1083 * physical engines for virtual engine. 
1084 */ 1085 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1086 !intel_engine_has_relative_mmio(engine)); 1087 1088 if (engine->class == RENDER_CLASS) { 1089 if (INTEL_GEN(engine->i915) >= 12) 1090 return gen12_rcs_offsets; 1091 else if (INTEL_GEN(engine->i915) >= 11) 1092 return gen11_rcs_offsets; 1093 else if (INTEL_GEN(engine->i915) >= 9) 1094 return gen9_rcs_offsets; 1095 else 1096 return gen8_rcs_offsets; 1097 } else { 1098 if (INTEL_GEN(engine->i915) >= 12) 1099 return gen12_xcs_offsets; 1100 else if (INTEL_GEN(engine->i915) >= 9) 1101 return gen9_xcs_offsets; 1102 else 1103 return gen8_xcs_offsets; 1104 } 1105 } 1106 1107 static struct i915_request * 1108 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1109 { 1110 struct i915_request *rq, *rn, *active = NULL; 1111 struct list_head *pl; 1112 int prio = I915_PRIORITY_INVALID; 1113 1114 lockdep_assert_held(&engine->active.lock); 1115 1116 list_for_each_entry_safe_reverse(rq, rn, 1117 &engine->active.requests, 1118 sched.link) { 1119 if (i915_request_completed(rq)) 1120 continue; /* XXX */ 1121 1122 __i915_request_unsubmit(rq); 1123 1124 /* 1125 * Push the request back into the queue for later resubmission. 1126 * If this request is not native to this physical engine (i.e. 1127 * it came from a virtual source), push it back onto the virtual 1128 * engine so that it can be moved across onto another physical 1129 * engine as load dictates. 1130 */ 1131 if (likely(rq->execution_mask == engine->mask)) { 1132 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1133 if (rq_prio(rq) != prio) { 1134 prio = rq_prio(rq); 1135 pl = i915_sched_lookup_priolist(engine, prio); 1136 } 1137 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1138 1139 list_move(&rq->sched.link, pl); 1140 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1141 1142 /* Check in case we rollback so far we wrap [size/2] */ 1143 if (intel_ring_direction(rq->ring, 1144 rq->tail, 1145 rq->ring->tail + 8) > 0) 1146 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1147 1148 active = rq; 1149 } else { 1150 struct intel_engine_cs *owner = rq->context->engine; 1151 1152 WRITE_ONCE(rq->engine, owner); 1153 owner->submit_request(rq); 1154 active = NULL; 1155 } 1156 } 1157 1158 return active; 1159 } 1160 1161 struct i915_request * 1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1163 { 1164 struct intel_engine_cs *engine = 1165 container_of(execlists, typeof(*engine), execlists); 1166 1167 return __unwind_incomplete_requests(engine); 1168 } 1169 1170 static inline void 1171 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1172 { 1173 /* 1174 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1175 * The compiler should eliminate this function as dead-code. 
1176 */ 1177 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1178 return; 1179 1180 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1181 status, rq); 1182 } 1183 1184 static void intel_engine_context_in(struct intel_engine_cs *engine) 1185 { 1186 unsigned long flags; 1187 1188 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1189 return; 1190 1191 write_seqlock_irqsave(&engine->stats.lock, flags); 1192 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1193 engine->stats.start = ktime_get(); 1194 atomic_inc(&engine->stats.active); 1195 } 1196 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1197 } 1198 1199 static void intel_engine_context_out(struct intel_engine_cs *engine) 1200 { 1201 unsigned long flags; 1202 1203 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1204 1205 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1206 return; 1207 1208 write_seqlock_irqsave(&engine->stats.lock, flags); 1209 if (atomic_dec_and_test(&engine->stats.active)) { 1210 engine->stats.total = 1211 ktime_add(engine->stats.total, 1212 ktime_sub(ktime_get(), engine->stats.start)); 1213 } 1214 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1215 } 1216 1217 static void 1218 execlists_check_context(const struct intel_context *ce, 1219 const struct intel_engine_cs *engine, 1220 const char *when) 1221 { 1222 const struct intel_ring *ring = ce->ring; 1223 u32 *regs = ce->lrc_reg_state; 1224 bool valid = true; 1225 int x; 1226 1227 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1228 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1229 engine->name, 1230 regs[CTX_RING_START], 1231 i915_ggtt_offset(ring->vma)); 1232 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1233 valid = false; 1234 } 1235 1236 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1237 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1238 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1239 engine->name, 1240 regs[CTX_RING_CTL], 1241 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1242 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1243 valid = false; 1244 } 1245 1246 x = lrc_ring_mi_mode(engine); 1247 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1248 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1249 engine->name, regs[x + 1]); 1250 regs[x + 1] &= ~STOP_RING; 1251 regs[x + 1] |= STOP_RING << 16; 1252 valid = false; 1253 } 1254 1255 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1256 } 1257 1258 static void restore_default_state(struct intel_context *ce, 1259 struct intel_engine_cs *engine) 1260 { 1261 u32 *regs; 1262 1263 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1264 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1265 1266 ce->runtime.last = intel_context_get_runtime(ce); 1267 } 1268 1269 static void reset_active(struct i915_request *rq, 1270 struct intel_engine_cs *engine) 1271 { 1272 struct intel_context * const ce = rq->context; 1273 u32 head; 1274 1275 /* 1276 * The executing context has been cancelled. We want to prevent 1277 * further execution along this context and propagate the error on 1278 * to anything depending on its results. 1279 * 1280 * In __i915_request_submit(), we apply the -EIO and remove the 1281 * requests' payloads for any banned requests. 
But first, we must 1282 * rewind the context back to the start of the incomplete request so 1283 * that we do not jump back into the middle of the batch. 1284 * 1285 * We preserve the breadcrumbs and semaphores of the incomplete 1286 * requests so that inter-timeline dependencies (i.e other timelines) 1287 * remain correctly ordered. And we defer to __i915_request_submit() 1288 * so that all asynchronous waits are correctly handled. 1289 */ 1290 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1291 rq->fence.context, rq->fence.seqno); 1292 1293 /* On resubmission of the active request, payload will be scrubbed */ 1294 if (i915_request_completed(rq)) 1295 head = rq->tail; 1296 else 1297 head = active_request(ce->timeline, rq)->head; 1298 head = intel_ring_wrap(ce->ring, head); 1299 1300 /* Scrub the context image to prevent replaying the previous batch */ 1301 restore_default_state(ce, engine); 1302 __execlists_update_reg_state(ce, engine, head); 1303 1304 /* We've switched away, so this should be a no-op, but intent matters */ 1305 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1306 } 1307 1308 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1309 { 1310 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1311 ce->runtime.num_underflow += dt < 0; 1312 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1313 #endif 1314 } 1315 1316 static void intel_context_update_runtime(struct intel_context *ce) 1317 { 1318 u32 old; 1319 s32 dt; 1320 1321 if (intel_context_is_barrier(ce)) 1322 return; 1323 1324 old = ce->runtime.last; 1325 ce->runtime.last = intel_context_get_runtime(ce); 1326 dt = ce->runtime.last - old; 1327 1328 if (unlikely(dt <= 0)) { 1329 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1330 old, ce->runtime.last, dt); 1331 st_update_runtime_underflow(ce, dt); 1332 return; 1333 } 1334 1335 ewma_runtime_add(&ce->runtime.avg, dt); 1336 ce->runtime.total += dt; 1337 } 1338 1339 static inline struct intel_engine_cs * 1340 __execlists_schedule_in(struct i915_request *rq) 1341 { 1342 struct intel_engine_cs * const engine = rq->engine; 1343 struct intel_context * const ce = rq->context; 1344 1345 intel_context_get(ce); 1346 1347 if (unlikely(intel_context_is_banned(ce))) 1348 reset_active(rq, engine); 1349 1350 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1351 execlists_check_context(ce, engine, "before"); 1352 1353 if (ce->tag) { 1354 /* Use a fixed tag for OA and friends */ 1355 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1356 ce->lrc.ccid = ce->tag; 1357 } else { 1358 /* We don't need a strict matching tag, just different values */ 1359 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1360 1361 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1362 clear_bit(tag - 1, &engine->context_tag); 1363 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1364 1365 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1366 } 1367 1368 ce->lrc.ccid |= engine->execlists.ccid; 1369 1370 __intel_gt_pm_get(engine->gt); 1371 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1372 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1373 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1374 intel_engine_context_in(engine); 1375 1376 return engine; 1377 } 1378 1379 static inline struct i915_request * 1380 execlists_schedule_in(struct i915_request *rq, int idx) 1381 { 1382 struct intel_context * const ce = rq->context; 1383 struct intel_engine_cs *old; 1384 1385 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1386 
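	/*
	 * ce->inflight tracks which physical engine currently owns this
	 * context, using the low pointer bits as a nesting count: the first
	 * submission installs the engine via __execlists_schedule_in(),
	 * back-to-back submissions of the same context merely bump the count
	 * (ptr_inc), and execlists_schedule_out() later drops that count,
	 * only tearing the context back down once the last reference goes.
	 */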
trace_i915_request_in(rq, idx); 1387 1388 old = READ_ONCE(ce->inflight); 1389 do { 1390 if (!old) { 1391 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1392 break; 1393 } 1394 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1395 1396 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1397 return i915_request_get(rq); 1398 } 1399 1400 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1401 { 1402 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1403 struct i915_request *next = READ_ONCE(ve->request); 1404 1405 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1406 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1407 } 1408 1409 static inline void 1410 __execlists_schedule_out(struct i915_request *rq, 1411 struct intel_engine_cs * const engine, 1412 unsigned int ccid) 1413 { 1414 struct intel_context * const ce = rq->context; 1415 1416 /* 1417 * NB process_csb() is not under the engine->active.lock and hence 1418 * schedule_out can race with schedule_in meaning that we should 1419 * refrain from doing non-trivial work here. 1420 */ 1421 1422 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1423 execlists_check_context(ce, engine, "after"); 1424 1425 /* 1426 * If we have just completed this context, the engine may now be 1427 * idle and we want to re-enter powersaving. 1428 */ 1429 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1430 i915_request_completed(rq)) 1431 intel_engine_add_retire(engine, ce->timeline); 1432 1433 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1434 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1435 if (ccid < BITS_PER_LONG) { 1436 GEM_BUG_ON(ccid == 0); 1437 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1438 set_bit(ccid - 1, &engine->context_tag); 1439 } 1440 1441 intel_context_update_runtime(ce); 1442 intel_engine_context_out(engine); 1443 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1444 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1445 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1446 intel_gt_pm_put_async(engine->gt); 1447 1448 /* 1449 * If this is part of a virtual engine, its next request may 1450 * have been blocked waiting for access to the active context. 1451 * We have to kick all the siblings again in case we need to 1452 * switch (e.g. the next request is not runnable on this 1453 * engine). Hopefully, we will already have submitted the next 1454 * request before the tasklet runs and do not need to rebuild 1455 * each virtual tree and kick everyone again. 1456 */ 1457 if (ce->engine != engine) 1458 kick_siblings(rq, ce); 1459 1460 intel_context_put(ce); 1461 } 1462 1463 static inline void 1464 execlists_schedule_out(struct i915_request *rq) 1465 { 1466 struct intel_context * const ce = rq->context; 1467 struct intel_engine_cs *cur, *old; 1468 u32 ccid; 1469 1470 trace_i915_request_out(rq); 1471 1472 ccid = rq->context->lrc.ccid; 1473 old = READ_ONCE(ce->inflight); 1474 do 1475 cur = ptr_unmask_bits(old, 2) ? 
ptr_dec(old) : NULL; 1476 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1477 if (!cur) 1478 __execlists_schedule_out(rq, old, ccid); 1479 1480 i915_request_put(rq); 1481 } 1482 1483 static u64 execlists_update_context(struct i915_request *rq) 1484 { 1485 struct intel_context *ce = rq->context; 1486 u64 desc = ce->lrc.desc; 1487 u32 tail, prev; 1488 1489 /* 1490 * WaIdleLiteRestore:bdw,skl 1491 * 1492 * We should never submit the context with the same RING_TAIL twice 1493 * just in case we submit an empty ring, which confuses the HW. 1494 * 1495 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1496 * the normal request to be able to always advance the RING_TAIL on 1497 * subsequent resubmissions (for lite restore). Should that fail us, 1498 * and we try and submit the same tail again, force the context 1499 * reload. 1500 * 1501 * If we need to return to a preempted context, we need to skip the 1502 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1503 * HW has a tendency to ignore us rewinding the TAIL to the end of 1504 * an earlier request. 1505 */ 1506 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1507 prev = rq->ring->tail; 1508 tail = intel_ring_set_tail(rq->ring, rq->tail); 1509 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1510 desc |= CTX_DESC_FORCE_RESTORE; 1511 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1512 rq->tail = rq->wa_tail; 1513 1514 /* 1515 * Make sure the context image is complete before we submit it to HW. 1516 * 1517 * Ostensibly, writes (including the WCB) should be flushed prior to 1518 * an uncached write such as our mmio register access, the empirical 1519 * evidence (esp. on Braswell) suggests that the WC write into memory 1520 * may not be visible to the HW prior to the completion of the UC 1521 * register write and that we may begin execution from the context 1522 * before its image is complete leading to invalid PD chasing. 1523 */ 1524 wmb(); 1525 1526 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1527 return desc; 1528 } 1529 1530 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1531 { 1532 if (execlists->ctrl_reg) { 1533 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1534 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1535 } else { 1536 writel(upper_32_bits(desc), execlists->submit_reg); 1537 writel(lower_32_bits(desc), execlists->submit_reg); 1538 } 1539 } 1540 1541 static __maybe_unused char * 1542 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1543 { 1544 if (!rq) 1545 return ""; 1546 1547 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1548 prefix, 1549 rq->context->lrc.ccid, 1550 rq->fence.context, rq->fence.seqno, 1551 i915_request_completed(rq) ? "!" : 1552 i915_request_started(rq) ? 
"*" : 1553 "", 1554 rq_prio(rq)); 1555 1556 return buf; 1557 } 1558 1559 static __maybe_unused void 1560 trace_ports(const struct intel_engine_execlists *execlists, 1561 const char *msg, 1562 struct i915_request * const *ports) 1563 { 1564 const struct intel_engine_cs *engine = 1565 container_of(execlists, typeof(*engine), execlists); 1566 char __maybe_unused p0[40], p1[40]; 1567 1568 if (!ports[0]) 1569 return; 1570 1571 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1572 dump_port(p0, sizeof(p0), "", ports[0]), 1573 dump_port(p1, sizeof(p1), ", ", ports[1])); 1574 } 1575 1576 static inline bool 1577 reset_in_progress(const struct intel_engine_execlists *execlists) 1578 { 1579 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1580 } 1581 1582 static __maybe_unused bool 1583 assert_pending_valid(const struct intel_engine_execlists *execlists, 1584 const char *msg) 1585 { 1586 struct intel_engine_cs *engine = 1587 container_of(execlists, typeof(*engine), execlists); 1588 struct i915_request * const *port, *rq; 1589 struct intel_context *ce = NULL; 1590 bool sentinel = false; 1591 u32 ccid = -1; 1592 1593 trace_ports(execlists, msg, execlists->pending); 1594 1595 /* We may be messing around with the lists during reset, lalala */ 1596 if (reset_in_progress(execlists)) 1597 return true; 1598 1599 if (!execlists->pending[0]) { 1600 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1601 engine->name); 1602 return false; 1603 } 1604 1605 if (execlists->pending[execlists_num_ports(execlists)]) { 1606 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1607 engine->name, execlists_num_ports(execlists)); 1608 return false; 1609 } 1610 1611 for (port = execlists->pending; (rq = *port); port++) { 1612 unsigned long flags; 1613 bool ok = true; 1614 1615 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1616 GEM_BUG_ON(!i915_request_is_active(rq)); 1617 1618 if (ce == rq->context) { 1619 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1620 engine->name, 1621 ce->timeline->fence_context, 1622 port - execlists->pending); 1623 return false; 1624 } 1625 ce = rq->context; 1626 1627 if (ccid == ce->lrc.ccid) { 1628 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1629 engine->name, 1630 ccid, ce->timeline->fence_context, 1631 port - execlists->pending); 1632 return false; 1633 } 1634 ccid = ce->lrc.ccid; 1635 1636 /* 1637 * Sentinels are supposed to be the last request so they flush 1638 * the current execution off the HW. Check that they are the only 1639 * request in the pending submission. 1640 */ 1641 if (sentinel) { 1642 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1643 engine->name, 1644 ce->timeline->fence_context, 1645 port - execlists->pending); 1646 return false; 1647 } 1648 sentinel = i915_request_has_sentinel(rq); 1649 1650 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1651 if (!spin_trylock_irqsave(&rq->lock, flags)) 1652 continue; 1653 1654 if (i915_request_completed(rq)) 1655 goto unlock; 1656 1657 if (i915_active_is_idle(&ce->active) && 1658 !intel_context_is_barrier(ce)) { 1659 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1660 engine->name, 1661 ce->timeline->fence_context, 1662 port - execlists->pending); 1663 ok = false; 1664 goto unlock; 1665 } 1666 1667 if (!i915_vma_is_pinned(ce->state)) { 1668 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1669 engine->name, 1670 ce->timeline->fence_context, 1671 port - execlists->pending); 1672 ok = false; 1673 goto unlock; 1674 } 1675 1676 if (!i915_vma_is_pinned(ce->ring->vma)) { 1677 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1678 engine->name, 1679 ce->timeline->fence_context, 1680 port - execlists->pending); 1681 ok = false; 1682 goto unlock; 1683 } 1684 1685 unlock: 1686 spin_unlock_irqrestore(&rq->lock, flags); 1687 if (!ok) 1688 return false; 1689 } 1690 1691 return ce; 1692 } 1693 1694 static void execlists_submit_ports(struct intel_engine_cs *engine) 1695 { 1696 struct intel_engine_execlists *execlists = &engine->execlists; 1697 unsigned int n; 1698 1699 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1700 1701 /* 1702 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1703 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1704 * not be relinquished until the device is idle (see 1705 * i915_gem_idle_work_handler()). As a precaution, we make sure 1706 * that all ELSP are drained i.e. we have processed the CSB, 1707 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1708 */ 1709 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1710 1711 /* 1712 * ELSQ note: the submit queue is not cleared after being submitted 1713 * to the HW so we need to make sure we always clean it up. This is 1714 * currently ensured by the fact that we always write the same number 1715 * of elsq entries, keep this in mind before changing the loop below. 1716 */ 1717 for (n = execlists_num_ports(execlists); n--; ) { 1718 struct i915_request *rq = execlists->pending[n]; 1719 1720 write_desc(execlists, 1721 rq ? execlists_update_context(rq) : 0, 1722 n); 1723 } 1724 1725 /* we need to manually load the submit queue */ 1726 if (execlists->ctrl_reg) 1727 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1728 } 1729 1730 static bool ctx_single_port_submission(const struct intel_context *ce) 1731 { 1732 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1733 intel_context_force_single_submission(ce)); 1734 } 1735 1736 static bool can_merge_ctx(const struct intel_context *prev, 1737 const struct intel_context *next) 1738 { 1739 if (prev != next) 1740 return false; 1741 1742 if (ctx_single_port_submission(prev)) 1743 return false; 1744 1745 return true; 1746 } 1747 1748 static unsigned long i915_request_flags(const struct i915_request *rq) 1749 { 1750 return READ_ONCE(rq->fence.flags); 1751 } 1752 1753 static bool can_merge_rq(const struct i915_request *prev, 1754 const struct i915_request *next) 1755 { 1756 GEM_BUG_ON(prev == next); 1757 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1758 1759 /* 1760 * We do not submit known completed requests. Therefore if the next 1761 * request is already completed, we can pretend to merge it in 1762 * with the previous context (and we will skip updating the ELSP 1763 * and tracking). 
Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_context(struct virtual_engine *ve,
				 struct intel_engine_cs *engine)
{
	unsigned int n;

	if (likely(engine == ve->siblings[0]))
		return;

	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
	if (!intel_engine_has_relative_mmio(engine))
		virtual_update_register_offsets(ve->context.lrc_reg_state,
						engine);

	/*
	 * Move the bound engine to the top of the list for
	 * future execution. We then kick this tasklet first
	 * before checking others, so that we preferentially
	 * reuse this set of bound registers.
	 */
	for (n = 1; n < ve->num_siblings; n++) {
		if (ve->siblings[n] == engine) {
			swap(ve->siblings[n], ve->siblings[0]);
			break;
		}
	}
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			if (p->flags & I915_DEPENDENCY_WEAK)
				continue;

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
				   i915_request_started(w) &&
				   !i915_request_completed(rq));

			GEM_BUG_ON(i915_request_is_active(w));
			if (!i915_request_is_ready(w))
				continue;

			if (rq_prio(w) < rq_prio(rq))
				continue;

			GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static void defer_active(struct intel_engine_cs *engine)
{
	struct i915_request *rq;

	rq = __unwind_incomplete_requests(engine);
	if (!rq)
		return;

	defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
}

static bool
need_timeslice(const struct intel_engine_cs *engine,
	       const struct i915_request *rq,
	       const struct rb_node *rb)
{
	int hint;

	if (!intel_engine_has_timeslices(engine))
		return false;

	hint = engine->execlists.queue_priority_hint;

	if (rb) {
		const struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		const struct intel_engine_cs *inflight =
			intel_context_inflight(&ve->context);

		if (!inflight || inflight == engine) {
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				hint = max(hint, rq_prio(next));
			rcu_read_unlock();
		}
	}

	if (!list_is_last(&rq->sched.link, &engine->active.requests))
		hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));

	GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
	return hint >= effective_prio(rq);
}

static bool
timeslice_yield(const struct intel_engine_execlists *el,
		const struct i915_request *rq)
{
	/*
	 * Once bitten, forever smitten!
	 *
	 * If the active context ever busy-waited on a semaphore,
	 * it will be treated as a hog until the end of its timeslice (i.e.
	 * until it is scheduled out and replaced by a new submission,
	 * possibly even its own lite-restore). The HW only sends an interrupt
	 * on the first miss, and we do not know if that semaphore has been
	 * signaled, or even if it is now stuck on another semaphore. Play
	 * safe, yield if it might be stuck -- it will be given a fresh
	 * timeslice in the near future.
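	 *
	 * (el->yield is latched from that semaphore-wait interrupt with the
	 * ccid of the context that was found spinning, which is what we
	 * compare against below.)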
1960 */ 1961 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1962 } 1963 1964 static bool 1965 timeslice_expired(const struct intel_engine_execlists *el, 1966 const struct i915_request *rq) 1967 { 1968 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1969 } 1970 1971 static int 1972 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1973 { 1974 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1975 return engine->execlists.queue_priority_hint; 1976 1977 return rq_prio(list_next_entry(rq, sched.link)); 1978 } 1979 1980 static inline unsigned long 1981 timeslice(const struct intel_engine_cs *engine) 1982 { 1983 return READ_ONCE(engine->props.timeslice_duration_ms); 1984 } 1985 1986 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1987 { 1988 const struct intel_engine_execlists *execlists = &engine->execlists; 1989 const struct i915_request *rq = *execlists->active; 1990 1991 if (!rq || i915_request_completed(rq)) 1992 return 0; 1993 1994 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1995 return 0; 1996 1997 return timeslice(engine); 1998 } 1999 2000 static void set_timeslice(struct intel_engine_cs *engine) 2001 { 2002 unsigned long duration; 2003 2004 if (!intel_engine_has_timeslices(engine)) 2005 return; 2006 2007 duration = active_timeslice(engine); 2008 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2009 2010 set_timer_ms(&engine->execlists.timer, duration); 2011 } 2012 2013 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2014 { 2015 struct intel_engine_execlists *execlists = &engine->execlists; 2016 unsigned long duration; 2017 2018 if (!intel_engine_has_timeslices(engine)) 2019 return; 2020 2021 WRITE_ONCE(execlists->switch_priority_hint, prio); 2022 if (prio == INT_MIN) 2023 return; 2024 2025 if (timer_pending(&execlists->timer)) 2026 return; 2027 2028 duration = timeslice(engine); 2029 ENGINE_TRACE(engine, 2030 "start timeslicing, prio:%d, interval:%lu", 2031 prio, duration); 2032 2033 set_timer_ms(&execlists->timer, duration); 2034 } 2035 2036 static void record_preemption(struct intel_engine_execlists *execlists) 2037 { 2038 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2039 } 2040 2041 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2042 const struct i915_request *rq) 2043 { 2044 if (!rq) 2045 return 0; 2046 2047 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2048 if (unlikely(intel_context_is_banned(rq->context))) 2049 return 1; 2050 2051 return READ_ONCE(engine->props.preempt_timeout_ms); 2052 } 2053 2054 static void set_preempt_timeout(struct intel_engine_cs *engine, 2055 const struct i915_request *rq) 2056 { 2057 if (!intel_engine_has_preempt_reset(engine)) 2058 return; 2059 2060 set_timer_ms(&engine->execlists.preempt, 2061 active_preempt_timeout(engine, rq)); 2062 } 2063 2064 static inline void clear_ports(struct i915_request **ports, int count) 2065 { 2066 memset_p((void **)ports, NULL, count); 2067 } 2068 2069 static inline void 2070 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2071 { 2072 /* A memcpy_p() would be very useful here! 
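 *
 * (The ports are also read locklessly via the seqlock-like scheme
 * around execlists->active -- see the smp_wmb() "complete the
 * seqlock" pairings below -- so each slot is copied with WRITE_ONCE()
 * to avoid tearing the pointer as it is published.)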
*/ 2073 while (count--) 2074 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2075 } 2076 2077 static void execlists_dequeue(struct intel_engine_cs *engine) 2078 { 2079 struct intel_engine_execlists * const execlists = &engine->execlists; 2080 struct i915_request **port = execlists->pending; 2081 struct i915_request ** const last_port = port + execlists->port_mask; 2082 struct i915_request * const *active; 2083 struct i915_request *last; 2084 struct rb_node *rb; 2085 bool submit = false; 2086 2087 /* 2088 * Hardware submission is through 2 ports. Conceptually each port 2089 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2090 * static for a context, and unique to each, so we only execute 2091 * requests belonging to a single context from each ring. RING_HEAD 2092 * is maintained by the CS in the context image, it marks the place 2093 * where it got up to last time, and through RING_TAIL we tell the CS 2094 * where we want to execute up to this time. 2095 * 2096 * In this list the requests are in order of execution. Consecutive 2097 * requests from the same context are adjacent in the ringbuffer. We 2098 * can combine these requests into a single RING_TAIL update: 2099 * 2100 * RING_HEAD...req1...req2 2101 * ^- RING_TAIL 2102 * since to execute req2 the CS must first execute req1. 2103 * 2104 * Our goal then is to point each port to the end of a consecutive 2105 * sequence of requests as being the most optimal (fewest wake ups 2106 * and context switches) submission. 2107 */ 2108 2109 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2110 struct virtual_engine *ve = 2111 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2112 struct i915_request *rq = READ_ONCE(ve->request); 2113 2114 if (!rq) { /* lazily cleanup after another engine handled rq */ 2115 rb_erase_cached(rb, &execlists->virtual); 2116 RB_CLEAR_NODE(rb); 2117 rb = rb_first_cached(&execlists->virtual); 2118 continue; 2119 } 2120 2121 if (!virtual_matches(ve, rq, engine)) { 2122 rb = rb_next(rb); 2123 continue; 2124 } 2125 2126 break; 2127 } 2128 2129 /* 2130 * If the queue is higher priority than the last 2131 * request in the currently active context, submit afresh. 2132 * We will resubmit again afterwards in case we need to split 2133 * the active context to interject the preemption request, 2134 * i.e. we will retrigger preemption following the ack in case 2135 * of trouble. 2136 */ 2137 active = READ_ONCE(execlists->active); 2138 2139 /* 2140 * In theory we can skip over completed contexts that have not 2141 * yet been processed by events (as those events are in flight): 2142 * 2143 * while ((last = *active) && i915_request_completed(last)) 2144 * active++; 2145 * 2146 * However, the GPU cannot handle this as it will ultimately 2147 * find itself trying to jump back into a context it has just 2148 * completed and barf. 2149 */ 2150 2151 if ((last = *active)) { 2152 if (need_preempt(engine, last, rb)) { 2153 if (i915_request_completed(last)) { 2154 tasklet_hi_schedule(&execlists->tasklet); 2155 return; 2156 } 2157 2158 ENGINE_TRACE(engine, 2159 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2160 last->fence.context, 2161 last->fence.seqno, 2162 last->sched.attr.priority, 2163 execlists->queue_priority_hint); 2164 record_preemption(execlists); 2165 2166 /* 2167 * Don't let the RING_HEAD advance past the breadcrumb 2168 * as we unwind (and until we resubmit) so that we do 2169 * not accidentally tell it to go backwards. 
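 *
 * (The pause is lifted again either by process_csb() once the
 * promotion event for the new submission arrives -- it calls
 * ring_set_paused(engine, 0) before flipping over to the new ELSP --
 * or via the skip_submit path below if we end up submitting nothing.)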
2170 */ 2171 ring_set_paused(engine, 1); 2172 2173 /* 2174 * Note that we have not stopped the GPU at this point, 2175 * so we are unwinding the incomplete requests as they 2176 * remain inflight and so by the time we do complete 2177 * the preemption, some of the unwound requests may 2178 * complete! 2179 */ 2180 __unwind_incomplete_requests(engine); 2181 2182 last = NULL; 2183 } else if (need_timeslice(engine, last, rb) && 2184 timeslice_expired(execlists, last)) { 2185 if (i915_request_completed(last)) { 2186 tasklet_hi_schedule(&execlists->tasklet); 2187 return; 2188 } 2189 2190 ENGINE_TRACE(engine, 2191 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2192 last->fence.context, 2193 last->fence.seqno, 2194 last->sched.attr.priority, 2195 execlists->queue_priority_hint, 2196 yesno(timeslice_yield(execlists, last))); 2197 2198 ring_set_paused(engine, 1); 2199 defer_active(engine); 2200 2201 /* 2202 * Unlike for preemption, if we rewind and continue 2203 * executing the same context as previously active, 2204 * the order of execution will remain the same and 2205 * the tail will only advance. We do not need to 2206 * force a full context restore, as a lite-restore 2207 * is sufficient to resample the monotonic TAIL. 2208 * 2209 * If we switch to any other context, similarly we 2210 * will not rewind TAIL of current context, and 2211 * normal save/restore will preserve state and allow 2212 * us to later continue executing the same request. 2213 */ 2214 last = NULL; 2215 } else { 2216 /* 2217 * Otherwise if we already have a request pending 2218 * for execution after the current one, we can 2219 * just wait until the next CS event before 2220 * queuing more. In either case we will force a 2221 * lite-restore preemption event, but if we wait 2222 * we hopefully coalesce several updates into a single 2223 * submission. 2224 */ 2225 if (!list_is_last(&last->sched.link, 2226 &engine->active.requests)) { 2227 /* 2228 * Even if ELSP[1] is occupied and not worthy 2229 * of timeslices, our queue might be. 2230 */ 2231 start_timeslice(engine, queue_prio(execlists)); 2232 return; 2233 } 2234 } 2235 } 2236 2237 while (rb) { /* XXX virtual is always taking precedence */ 2238 struct virtual_engine *ve = 2239 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2240 struct i915_request *rq; 2241 2242 spin_lock(&ve->base.active.lock); 2243 2244 rq = ve->request; 2245 if (unlikely(!rq)) { /* lost the race to a sibling */ 2246 spin_unlock(&ve->base.active.lock); 2247 rb_erase_cached(rb, &execlists->virtual); 2248 RB_CLEAR_NODE(rb); 2249 rb = rb_first_cached(&execlists->virtual); 2250 continue; 2251 } 2252 2253 GEM_BUG_ON(rq != ve->request); 2254 GEM_BUG_ON(rq->engine != &ve->base); 2255 GEM_BUG_ON(rq->context != &ve->context); 2256 2257 if (rq_prio(rq) >= queue_prio(execlists)) { 2258 if (!virtual_matches(ve, rq, engine)) { 2259 spin_unlock(&ve->base.active.lock); 2260 rb = rb_next(rb); 2261 continue; 2262 } 2263 2264 if (last && !can_merge_rq(last, rq)) { 2265 spin_unlock(&ve->base.active.lock); 2266 start_timeslice(engine, rq_prio(rq)); 2267 return; /* leave this for another sibling */ 2268 } 2269 2270 ENGINE_TRACE(engine, 2271 "virtual rq=%llx:%lld%s, new engine? %s\n", 2272 rq->fence.context, 2273 rq->fence.seqno, 2274 i915_request_completed(rq) ? "!" : 2275 i915_request_started(rq) ? 
"*" : 2276 "", 2277 yesno(engine != ve->siblings[0])); 2278 2279 WRITE_ONCE(ve->request, NULL); 2280 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2281 INT_MIN); 2282 rb_erase_cached(rb, &execlists->virtual); 2283 RB_CLEAR_NODE(rb); 2284 2285 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2286 WRITE_ONCE(rq->engine, engine); 2287 2288 if (__i915_request_submit(rq)) { 2289 /* 2290 * Only after we confirm that we will submit 2291 * this request (i.e. it has not already 2292 * completed), do we want to update the context. 2293 * 2294 * This serves two purposes. It avoids 2295 * unnecessary work if we are resubmitting an 2296 * already completed request after timeslicing. 2297 * But more importantly, it prevents us altering 2298 * ve->siblings[] on an idle context, where 2299 * we may be using ve->siblings[] in 2300 * virtual_context_enter / virtual_context_exit. 2301 */ 2302 virtual_xfer_context(ve, engine); 2303 GEM_BUG_ON(ve->siblings[0] != engine); 2304 2305 submit = true; 2306 last = rq; 2307 } 2308 i915_request_put(rq); 2309 2310 /* 2311 * Hmm, we have a bunch of virtual engine requests, 2312 * but the first one was already completed (thanks 2313 * preempt-to-busy!). Keep looking at the veng queue 2314 * until we have no more relevant requests (i.e. 2315 * the normal submit queue has higher priority). 2316 */ 2317 if (!submit) { 2318 spin_unlock(&ve->base.active.lock); 2319 rb = rb_first_cached(&execlists->virtual); 2320 continue; 2321 } 2322 } 2323 2324 spin_unlock(&ve->base.active.lock); 2325 break; 2326 } 2327 2328 while ((rb = rb_first_cached(&execlists->queue))) { 2329 struct i915_priolist *p = to_priolist(rb); 2330 struct i915_request *rq, *rn; 2331 int i; 2332 2333 priolist_for_each_request_consume(rq, rn, p, i) { 2334 bool merge = true; 2335 2336 /* 2337 * Can we combine this request with the current port? 2338 * It has to be the same context/ringbuffer and not 2339 * have any exceptions (e.g. GVT saying never to 2340 * combine contexts). 2341 * 2342 * If we can combine the requests, we can execute both 2343 * by updating the RING_TAIL to point to the end of the 2344 * second request, and so we never need to tell the 2345 * hardware about the first. 2346 */ 2347 if (last && !can_merge_rq(last, rq)) { 2348 /* 2349 * If we are on the second port and cannot 2350 * combine this request with the last, then we 2351 * are done. 2352 */ 2353 if (port == last_port) 2354 goto done; 2355 2356 /* 2357 * We must not populate both ELSP[] with the 2358 * same LRCA, i.e. we must submit 2 different 2359 * contexts if we submit 2 ELSP. 2360 */ 2361 if (last->context == rq->context) 2362 goto done; 2363 2364 if (i915_request_has_sentinel(last)) 2365 goto done; 2366 2367 /* 2368 * If GVT overrides us we only ever submit 2369 * port[0], leaving port[1] empty. Note that we 2370 * also have to be careful that we don't queue 2371 * the same context (even though a different 2372 * request) to the second port. 
2373 */ 2374 if (ctx_single_port_submission(last->context) || 2375 ctx_single_port_submission(rq->context)) 2376 goto done; 2377 2378 merge = false; 2379 } 2380 2381 if (__i915_request_submit(rq)) { 2382 if (!merge) { 2383 *port = execlists_schedule_in(last, port - execlists->pending); 2384 port++; 2385 last = NULL; 2386 } 2387 2388 GEM_BUG_ON(last && 2389 !can_merge_ctx(last->context, 2390 rq->context)); 2391 GEM_BUG_ON(last && 2392 i915_seqno_passed(last->fence.seqno, 2393 rq->fence.seqno)); 2394 2395 submit = true; 2396 last = rq; 2397 } 2398 } 2399 2400 rb_erase_cached(&p->node, &execlists->queue); 2401 i915_priolist_free(p); 2402 } 2403 2404 done: 2405 /* 2406 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2407 * 2408 * We choose the priority hint such that if we add a request of greater 2409 * priority than this, we kick the submission tasklet to decide on 2410 * the right order of submitting the requests to hardware. We must 2411 * also be prepared to reorder requests as they are in-flight on the 2412 * HW. We derive the priority hint then as the first "hole" in 2413 * the HW submission ports and if there are no available slots, 2414 * the priority of the lowest executing request, i.e. last. 2415 * 2416 * When we do receive a higher priority request ready to run from the 2417 * user, see queue_request(), the priority hint is bumped to that 2418 * request triggering preemption on the next dequeue (or subsequent 2419 * interrupt for secondary ports). 2420 */ 2421 execlists->queue_priority_hint = queue_prio(execlists); 2422 2423 if (submit) { 2424 *port = execlists_schedule_in(last, port - execlists->pending); 2425 execlists->switch_priority_hint = 2426 switch_prio(engine, *execlists->pending); 2427 2428 /* 2429 * Skip if we ended up with exactly the same set of requests, 2430 * e.g. 
trying to timeslice a pair of ordered contexts 2431 */ 2432 if (!memcmp(active, execlists->pending, 2433 (port - execlists->pending + 1) * sizeof(*port))) { 2434 do 2435 execlists_schedule_out(fetch_and_zero(port)); 2436 while (port-- != execlists->pending); 2437 2438 goto skip_submit; 2439 } 2440 clear_ports(port + 1, last_port - port); 2441 2442 WRITE_ONCE(execlists->yield, -1); 2443 set_preempt_timeout(engine, *active); 2444 execlists_submit_ports(engine); 2445 } else { 2446 start_timeslice(engine, execlists->queue_priority_hint); 2447 skip_submit: 2448 ring_set_paused(engine, 0); 2449 } 2450 } 2451 2452 static void 2453 cancel_port_requests(struct intel_engine_execlists * const execlists) 2454 { 2455 struct i915_request * const *port; 2456 2457 for (port = execlists->pending; *port; port++) 2458 execlists_schedule_out(*port); 2459 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2460 2461 /* Mark the end of active before we overwrite *active */ 2462 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2463 execlists_schedule_out(*port); 2464 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2465 2466 smp_wmb(); /* complete the seqlock for execlists_active() */ 2467 WRITE_ONCE(execlists->active, execlists->inflight); 2468 } 2469 2470 static inline void 2471 invalidate_csb_entries(const u64 *first, const u64 *last) 2472 { 2473 clflush((void *)first); 2474 clflush((void *)last); 2475 } 2476 2477 /* 2478 * Starting with Gen12, the status has a new format: 2479 * 2480 * bit 0: switched to new queue 2481 * bit 1: reserved 2482 * bit 2: semaphore wait mode (poll or signal), only valid when 2483 * switch detail is set to "wait on semaphore" 2484 * bits 3-5: engine class 2485 * bits 6-11: engine instance 2486 * bits 12-14: reserved 2487 * bits 15-25: sw context id of the lrc the GT switched to 2488 * bits 26-31: sw counter of the lrc the GT switched to 2489 * bits 32-35: context switch detail 2490 * - 0: ctx complete 2491 * - 1: wait on sync flip 2492 * - 2: wait on vblank 2493 * - 3: wait on scanline 2494 * - 4: wait on semaphore 2495 * - 5: context preempted (not on SEMAPHORE_WAIT or 2496 * WAIT_FOR_EVENT) 2497 * bit 36: reserved 2498 * bits 37-43: wait detail (for switch detail 1 to 4) 2499 * bits 44-46: reserved 2500 * bits 47-57: sw context id of the lrc the GT switched away from 2501 * bits 58-63: sw counter of the lrc the GT switched away from 2502 */ 2503 static inline bool gen12_csb_parse(const u64 csb) 2504 { 2505 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb)); 2506 bool new_queue = 2507 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2508 2509 /* 2510 * The context switch detail is not guaranteed to be 5 when a preemption 2511 * occurs, so we can't just check for that. The check below works for 2512 * all the cases we care about, including preemptions of WAIT 2513 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2514 * would require some extra handling, but we don't support that. 2515 */ 2516 if (!ctx_away_valid || new_queue) { 2517 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb))); 2518 return true; 2519 } 2520 2521 /* 2522 * switch detail = 5 is covered by the case above and we do not expect a 2523 * context switch on an unsuccessful wait instruction since we always 2524 * use polling mode. 
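 *
 * (As a worked example of the layout documented above: an entry whose
 * lower dword has bit 0 set -- "switched to new queue" -- is treated
 * as a promotion to the pending ELSP regardless of the away-context
 * id, while an entry with bit 0 clear and a valid away-context id
 * describes the GT switching away from a context it has finished
 * with, so we return false and process_csb() just advances past the
 * head of the active ports.)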
2525 */ 2526 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb))); 2527 return false; 2528 } 2529 2530 static inline bool gen8_csb_parse(const u64 csb) 2531 { 2532 return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2533 } 2534 2535 static noinline u64 2536 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2537 { 2538 u64 entry; 2539 2540 /* 2541 * Reading from the HWSP has one particular advantage: we can detect 2542 * a stale entry. Since the write into HWSP is broken, we have no reason 2543 * to trust the HW at all, the mmio entry may equally be unordered, so 2544 * we prefer the path that is self-checking and as a last resort, 2545 * return the mmio value. 2546 * 2547 * tgl,dg1:HSDES#22011327657 2548 */ 2549 preempt_disable(); 2550 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) { 2551 int idx = csb - engine->execlists.csb_status; 2552 int status; 2553 2554 status = GEN8_EXECLISTS_STATUS_BUF; 2555 if (idx >= 6) { 2556 status = GEN11_EXECLISTS_STATUS_BUF2; 2557 idx -= 6; 2558 } 2559 status += sizeof(u64) * idx; 2560 2561 entry = intel_uncore_read64(engine->uncore, 2562 _MMIO(engine->mmio_base + status)); 2563 } 2564 preempt_enable(); 2565 2566 return entry; 2567 } 2568 2569 static inline u64 2570 csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2571 { 2572 u64 entry = READ_ONCE(*csb); 2573 2574 /* 2575 * Unfortunately, the GPU does not always serialise its write 2576 * of the CSB entries before its write of the CSB pointer, at least 2577 * from the perspective of the CPU, using what is known as a Global 2578 * Observation Point. We may read a new CSB tail pointer, but then 2579 * read the stale CSB entries, causing us to misinterpret the 2580 * context-switch events, and eventually declare the GPU hung. 2581 * 2582 * icl:HSDES#1806554093 2583 * tgl:HSDES#22011248461 2584 */ 2585 if (unlikely(entry == -1)) 2586 entry = wa_csb_read(engine, csb); 2587 2588 /* Consume this entry so that we can spot its future reuse. */ 2589 WRITE_ONCE(*csb, -1); 2590 2591 /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */ 2592 return entry; 2593 } 2594 2595 static void process_csb(struct intel_engine_cs *engine) 2596 { 2597 struct intel_engine_execlists * const execlists = &engine->execlists; 2598 u64 * const buf = execlists->csb_status; 2599 const u8 num_entries = execlists->csb_size; 2600 u8 head, tail; 2601 2602 /* 2603 * As we modify our execlists state tracking we require exclusive 2604 * access. Either we are inside the tasklet, or the tasklet is disabled 2605 * and we assume that is only inside the reset paths and so serialised. 2606 */ 2607 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2608 !reset_in_progress(execlists)); 2609 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2610 2611 /* 2612 * Note that csb_write, csb_status may be either in HWSP or mmio. 2613 * When reading from the csb_write mmio register, we have to be 2614 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2615 * the low 4bits. As it happens we know the next 4bits are always 2616 * zero and so we can simply masked off the low u8 of the register 2617 * and treat it identically to reading from the HWSP (without having 2618 * to use explicit shifting and masking, and probably bifurcating 2619 * the code to handle the legacy mmio read). 
2620 */
2621 head = execlists->csb_head;
2622 tail = READ_ONCE(*execlists->csb_write);
2623 if (unlikely(head == tail))
2624 return;
2625
2626 /*
2627 * We will consume all events from HW, or at least pretend to.
2628 *
2629 * The sequence of events from the HW is deterministic, and derived
2630 * from our writes to the ELSP, with a smidgen of variability for
2631 * the arrival of the asynchronous requests wrt the inflight
2632 * execution. If the HW sends an event that does not correspond with
2633 * the one we are expecting, we have to abandon all hope as we lose
2634 * all tracking of what the engine is actually executing. We will
2635 * only detect we are out of sequence with the HW when we get an
2636 * 'impossible' event because we have already drained our own
2637 * preemption/promotion queue. If this occurs, we know that we likely
2638 * lost track of execution earlier and must unwind and restart; the
2639 * simplest way is to stop processing the event queue and force the
2640 * engine to reset.
2641 */
2642 execlists->csb_head = tail;
2643 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2644
2645 /*
2646 * Hopefully paired with a wmb() in HW!
2647 *
2648 * We must complete the read of the write pointer before any reads
2649 * from the CSB, so that we do not see stale values. Without an rmb
2650 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2651 * we perform the READ_ONCE(*csb_write).
2652 */
2653 rmb();
2654 do {
2655 bool promote;
2656 u64 csb;
2657
2658 if (++head == num_entries)
2659 head = 0;
2660
2661 /*
2662 * We are flying near dragons again.
2663 *
2664 * We hold a reference to the request in execlist_port[]
2665 * but no more than that. We are operating in softirq
2666 * context and so cannot hold any mutex or sleep. That
2667 * prevents us stopping the requests we are processing
2668 * in port[] from being retired simultaneously (the
2669 * breadcrumb will be complete before we see the
2670 * context-switch). As we only hold the reference to the
2671 * request, any pointer chasing underneath the request
2672 * is subject to a potential use-after-free. Thus we
2673 * store all of the bookkeeping within port[] as
2674 * required, and avoid using unguarded pointers beneath
2675 * request itself. The same applies to the atomic
2676 * status notifier.
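 *
 * (With that in mind, each pass of the loop consumes exactly one CSB
 * event: a promotion flips execlists->active over to the freshly
 * submitted execlists->pending, while a completion event schedules
 * out the head of the active ports and advances past it.)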
2677 */ 2678 2679 csb = csb_read(engine, buf + head); 2680 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2681 head, upper_32_bits(csb), lower_32_bits(csb)); 2682 2683 if (INTEL_GEN(engine->i915) >= 12) 2684 promote = gen12_csb_parse(csb); 2685 else 2686 promote = gen8_csb_parse(csb); 2687 if (promote) { 2688 struct i915_request * const *old = execlists->active; 2689 2690 if (GEM_WARN_ON(!*execlists->pending)) { 2691 execlists->error_interrupt |= ERROR_CSB; 2692 break; 2693 } 2694 2695 ring_set_paused(engine, 0); 2696 2697 /* Point active to the new ELSP; prevent overwriting */ 2698 WRITE_ONCE(execlists->active, execlists->pending); 2699 smp_wmb(); /* notify execlists_active() */ 2700 2701 /* cancel old inflight, prepare for switch */ 2702 trace_ports(execlists, "preempted", old); 2703 while (*old) 2704 execlists_schedule_out(*old++); 2705 2706 /* switch pending to inflight */ 2707 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2708 copy_ports(execlists->inflight, 2709 execlists->pending, 2710 execlists_num_ports(execlists)); 2711 smp_wmb(); /* complete the seqlock */ 2712 WRITE_ONCE(execlists->active, execlists->inflight); 2713 2714 /* XXX Magic delay for tgl */ 2715 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2716 2717 WRITE_ONCE(execlists->pending[0], NULL); 2718 } else { 2719 if (GEM_WARN_ON(!*execlists->active)) { 2720 execlists->error_interrupt |= ERROR_CSB; 2721 break; 2722 } 2723 2724 /* port0 completed, advanced to port1 */ 2725 trace_ports(execlists, "completed", execlists->active); 2726 2727 /* 2728 * We rely on the hardware being strongly 2729 * ordered, that the breadcrumb write is 2730 * coherent (visible from the CPU) before the 2731 * user interrupt is processed. One might assume 2732 * that the breadcrumb write being before the 2733 * user interrupt and the CS event for the context 2734 * switch would therefore be before the CS event 2735 * itself... 2736 */ 2737 if (GEM_SHOW_DEBUG() && 2738 !i915_request_completed(*execlists->active)) { 2739 struct i915_request *rq = *execlists->active; 2740 const u32 *regs __maybe_unused = 2741 rq->context->lrc_reg_state; 2742 2743 ENGINE_TRACE(engine, 2744 "context completed before request!\n"); 2745 ENGINE_TRACE(engine, 2746 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2747 ENGINE_READ(engine, RING_START), 2748 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2749 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2750 ENGINE_READ(engine, RING_CTL), 2751 ENGINE_READ(engine, RING_MI_MODE)); 2752 ENGINE_TRACE(engine, 2753 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2754 i915_ggtt_offset(rq->ring->vma), 2755 rq->head, rq->tail, 2756 rq->fence.context, 2757 lower_32_bits(rq->fence.seqno), 2758 hwsp_seqno(rq)); 2759 ENGINE_TRACE(engine, 2760 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2761 regs[CTX_RING_START], 2762 regs[CTX_RING_HEAD], 2763 regs[CTX_RING_TAIL]); 2764 } 2765 2766 execlists_schedule_out(*execlists->active++); 2767 2768 GEM_BUG_ON(execlists->active - execlists->inflight > 2769 execlists_num_ports(execlists)); 2770 } 2771 } while (head != tail); 2772 2773 set_timeslice(engine); 2774 2775 /* 2776 * Gen11 has proven to fail wrt global observation point between 2777 * entry and tail update, failing on the ordering and thus 2778 * we see an old entry in the context status buffer. 2779 * 2780 * Forcibly evict out entries for the next gpu csb update, 2781 * to increase the odds that we get a fresh entries with non 2782 * working hardware. 
The cost for doing so comes out mostly in
2783 * the wash as hardware, working or not, will need to do the
2784 * invalidation before.
2785 */
2786 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2787 }
2788
2789 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2790 {
2791 lockdep_assert_held(&engine->active.lock);
2792 if (!READ_ONCE(engine->execlists.pending[0])) {
2793 rcu_read_lock(); /* protect peeking at execlists->active */
2794 execlists_dequeue(engine);
2795 rcu_read_unlock();
2796 }
2797 }
2798
2799 static void __execlists_hold(struct i915_request *rq)
2800 {
2801 LIST_HEAD(list);
2802
2803 do {
2804 struct i915_dependency *p;
2805
2806 if (i915_request_is_active(rq))
2807 __i915_request_unsubmit(rq);
2808
2809 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2810 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2811 i915_request_set_hold(rq);
2812 RQ_TRACE(rq, "on hold\n");
2813
2814 for_each_waiter(p, rq) {
2815 struct i915_request *w =
2816 container_of(p->waiter, typeof(*w), sched);
2817
2818 /* Leave semaphores spinning on the other engines */
2819 if (w->engine != rq->engine)
2820 continue;
2821
2822 if (!i915_request_is_ready(w))
2823 continue;
2824
2825 if (i915_request_completed(w))
2826 continue;
2827
2828 if (i915_request_on_hold(w))
2829 continue;
2830
2831 list_move_tail(&w->sched.link, &list);
2832 }
2833
2834 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2835 } while (rq);
2836 }
2837
2838 static bool execlists_hold(struct intel_engine_cs *engine,
2839 struct i915_request *rq)
2840 {
2841 if (i915_request_on_hold(rq))
2842 return false;
2843
2844 spin_lock_irq(&engine->active.lock);
2845
2846 if (i915_request_completed(rq)) { /* too late! */
2847 rq = NULL;
2848 goto unlock;
2849 }
2850
2851 if (rq->engine != engine) { /* preempted virtual engine */
2852 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2853
2854 /*
2855 * intel_context_inflight() is only protected by virtue
2856 * of process_csb() being called only by the tasklet (or
2857 * directly from inside reset while the tasklet is suspended).
2858 * Assert that neither of those are allowed to run while we
2859 * poke at the request queues.
2860 */
2861 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2862
2863 /*
2864 * An unsubmitted request along a virtual engine will
2865 * remain on the active (this) engine until we are able
2866 * to process the context switch away (and so mark the
2867 * context as no longer in flight). That cannot have happened
2868 * yet, otherwise we would not be hanging!
2869 */
2870 spin_lock(&ve->base.active.lock);
2871 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2872 GEM_BUG_ON(ve->request != rq);
2873 ve->request = NULL;
2874 spin_unlock(&ve->base.active.lock);
2875 i915_request_put(rq);
2876
2877 rq->engine = engine;
2878 }
2879
2880 /*
2881 * Transfer this request onto the hold queue to prevent it
2882 * being resubmitted to HW (and potentially completed) before we have
2883 * released it. Since we may have already submitted following
2884 * requests, we need to remove those as well.
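 *
 * (__execlists_hold() therefore walks the waiter tree and pulls every
 * ready, same-engine dependant onto engine->active.hold along with
 * this request; execlists_unhold() later returns the whole group to
 * the priority queue in one go.)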
2885 */ 2886 GEM_BUG_ON(i915_request_on_hold(rq)); 2887 GEM_BUG_ON(rq->engine != engine); 2888 __execlists_hold(rq); 2889 GEM_BUG_ON(list_empty(&engine->active.hold)); 2890 2891 unlock: 2892 spin_unlock_irq(&engine->active.lock); 2893 return rq; 2894 } 2895 2896 static bool hold_request(const struct i915_request *rq) 2897 { 2898 struct i915_dependency *p; 2899 bool result = false; 2900 2901 /* 2902 * If one of our ancestors is on hold, we must also be on hold, 2903 * otherwise we will bypass it and execute before it. 2904 */ 2905 rcu_read_lock(); 2906 for_each_signaler(p, rq) { 2907 const struct i915_request *s = 2908 container_of(p->signaler, typeof(*s), sched); 2909 2910 if (s->engine != rq->engine) 2911 continue; 2912 2913 result = i915_request_on_hold(s); 2914 if (result) 2915 break; 2916 } 2917 rcu_read_unlock(); 2918 2919 return result; 2920 } 2921 2922 static void __execlists_unhold(struct i915_request *rq) 2923 { 2924 LIST_HEAD(list); 2925 2926 do { 2927 struct i915_dependency *p; 2928 2929 RQ_TRACE(rq, "hold release\n"); 2930 2931 GEM_BUG_ON(!i915_request_on_hold(rq)); 2932 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2933 2934 i915_request_clear_hold(rq); 2935 list_move_tail(&rq->sched.link, 2936 i915_sched_lookup_priolist(rq->engine, 2937 rq_prio(rq))); 2938 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2939 2940 /* Also release any children on this engine that are ready */ 2941 for_each_waiter(p, rq) { 2942 struct i915_request *w = 2943 container_of(p->waiter, typeof(*w), sched); 2944 2945 /* Propagate any change in error status */ 2946 if (rq->fence.error) 2947 i915_request_set_error_once(w, rq->fence.error); 2948 2949 if (w->engine != rq->engine) 2950 continue; 2951 2952 if (!i915_request_on_hold(w)) 2953 continue; 2954 2955 /* Check that no other parents are also on hold */ 2956 if (hold_request(w)) 2957 continue; 2958 2959 list_move_tail(&w->sched.link, &list); 2960 } 2961 2962 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2963 } while (rq); 2964 } 2965 2966 static void execlists_unhold(struct intel_engine_cs *engine, 2967 struct i915_request *rq) 2968 { 2969 spin_lock_irq(&engine->active.lock); 2970 2971 /* 2972 * Move this request back to the priority queue, and all of its 2973 * children and grandchildren that were suspended along with it. 2974 */ 2975 __execlists_unhold(rq); 2976 2977 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2978 engine->execlists.queue_priority_hint = rq_prio(rq); 2979 tasklet_hi_schedule(&engine->execlists.tasklet); 2980 } 2981 2982 spin_unlock_irq(&engine->active.lock); 2983 } 2984 2985 struct execlists_capture { 2986 struct work_struct work; 2987 struct i915_request *rq; 2988 struct i915_gpu_coredump *error; 2989 }; 2990 2991 static void execlists_capture_work(struct work_struct *work) 2992 { 2993 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2994 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2995 struct intel_engine_cs *engine = cap->rq->engine; 2996 struct intel_gt_coredump *gt = cap->error->gt; 2997 struct intel_engine_capture_vma *vma; 2998 2999 /* Compress all the objects attached to the request, slow! 
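 *
 * (This runs from a worker precisely because it is slow: the capture
 * itself was taken in atomic/softirq context with GFP_ATOMIC in
 * capture_regs(), so the GFP_KERNEL allocations and the page
 * compression are deferred here, after which the request is released
 * from its hold and returned for signaling.)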
*/ 3000 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 3001 if (vma) { 3002 struct i915_vma_compress *compress = 3003 i915_vma_capture_prepare(gt); 3004 3005 intel_engine_coredump_add_vma(gt->engine, vma, compress); 3006 i915_vma_capture_finish(gt, compress); 3007 } 3008 3009 gt->simulated = gt->engine->simulated; 3010 cap->error->simulated = gt->simulated; 3011 3012 /* Publish the error state, and announce it to the world */ 3013 i915_error_state_store(cap->error); 3014 i915_gpu_coredump_put(cap->error); 3015 3016 /* Return this request and all that depend upon it for signaling */ 3017 execlists_unhold(engine, cap->rq); 3018 i915_request_put(cap->rq); 3019 3020 kfree(cap); 3021 } 3022 3023 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 3024 { 3025 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 3026 struct execlists_capture *cap; 3027 3028 cap = kmalloc(sizeof(*cap), gfp); 3029 if (!cap) 3030 return NULL; 3031 3032 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 3033 if (!cap->error) 3034 goto err_cap; 3035 3036 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 3037 if (!cap->error->gt) 3038 goto err_gpu; 3039 3040 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 3041 if (!cap->error->gt->engine) 3042 goto err_gt; 3043 3044 cap->error->gt->engine->hung = true; 3045 3046 return cap; 3047 3048 err_gt: 3049 kfree(cap->error->gt); 3050 err_gpu: 3051 kfree(cap->error); 3052 err_cap: 3053 kfree(cap); 3054 return NULL; 3055 } 3056 3057 static struct i915_request * 3058 active_context(struct intel_engine_cs *engine, u32 ccid) 3059 { 3060 const struct intel_engine_execlists * const el = &engine->execlists; 3061 struct i915_request * const *port, *rq; 3062 3063 /* 3064 * Use the most recent result from process_csb(), but just in case 3065 * we trigger an error (via interrupt) before the first CS event has 3066 * been written, peek at the next submission. 3067 */ 3068 3069 for (port = el->active; (rq = *port); port++) { 3070 if (rq->context->lrc.ccid == ccid) { 3071 ENGINE_TRACE(engine, 3072 "ccid found at active:%zd\n", 3073 port - el->active); 3074 return rq; 3075 } 3076 } 3077 3078 for (port = el->pending; (rq = *port); port++) { 3079 if (rq->context->lrc.ccid == ccid) { 3080 ENGINE_TRACE(engine, 3081 "ccid found at pending:%zd\n", 3082 port - el->pending); 3083 return rq; 3084 } 3085 } 3086 3087 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3088 return NULL; 3089 } 3090 3091 static u32 active_ccid(struct intel_engine_cs *engine) 3092 { 3093 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3094 } 3095 3096 static void execlists_capture(struct intel_engine_cs *engine) 3097 { 3098 struct execlists_capture *cap; 3099 3100 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3101 return; 3102 3103 /* 3104 * We need to _quickly_ capture the engine state before we reset. 3105 * We are inside an atomic section (softirq) here and we are delaying 3106 * the forced preemption event. 3107 */ 3108 cap = capture_regs(engine); 3109 if (!cap) 3110 return; 3111 3112 spin_lock_irq(&engine->active.lock); 3113 cap->rq = active_context(engine, active_ccid(engine)); 3114 if (cap->rq) { 3115 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3116 cap->rq = i915_request_get_rcu(cap->rq); 3117 } 3118 spin_unlock_irq(&engine->active.lock); 3119 if (!cap->rq) 3120 goto err_free; 3121 3122 /* 3123 * Remove the request from the execlists queue, and take ownership 3124 * of the request. 
We pass it to our worker who will _slowly_ compress 3125 * all the pages the _user_ requested for debugging their batch, after 3126 * which we return it to the queue for signaling. 3127 * 3128 * By removing them from the execlists queue, we also remove the 3129 * requests from being processed by __unwind_incomplete_requests() 3130 * during the intel_engine_reset(), and so they will *not* be replayed 3131 * afterwards. 3132 * 3133 * Note that because we have not yet reset the engine at this point, 3134 * it is possible for the request that we have identified as being 3135 * guilty, did in fact complete and we will then hit an arbitration 3136 * point allowing the outstanding preemption to succeed. The likelihood 3137 * of that is very low (as capturing of the engine registers should be 3138 * fast enough to run inside an irq-off atomic section!), so we will 3139 * simply hold that request accountable for being non-preemptible 3140 * long enough to force the reset. 3141 */ 3142 if (!execlists_hold(engine, cap->rq)) 3143 goto err_rq; 3144 3145 INIT_WORK(&cap->work, execlists_capture_work); 3146 schedule_work(&cap->work); 3147 return; 3148 3149 err_rq: 3150 i915_request_put(cap->rq); 3151 err_free: 3152 i915_gpu_coredump_put(cap->error); 3153 kfree(cap); 3154 } 3155 3156 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3157 { 3158 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3159 unsigned long *lock = &engine->gt->reset.flags; 3160 3161 if (!intel_has_reset_engine(engine->gt)) 3162 return; 3163 3164 if (test_and_set_bit(bit, lock)) 3165 return; 3166 3167 ENGINE_TRACE(engine, "reset for %s\n", msg); 3168 3169 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3170 tasklet_disable_nosync(&engine->execlists.tasklet); 3171 3172 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3173 execlists_capture(engine); 3174 intel_engine_reset(engine, msg); 3175 3176 tasklet_enable(&engine->execlists.tasklet); 3177 clear_and_wake_up_bit(bit, lock); 3178 } 3179 3180 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3181 { 3182 const struct timer_list *t = &engine->execlists.preempt; 3183 3184 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3185 return false; 3186 3187 if (!timer_expired(t)) 3188 return false; 3189 3190 return READ_ONCE(engine->execlists.pending[0]); 3191 } 3192 3193 /* 3194 * Check the unread Context Status Buffers and manage the submission of new 3195 * contexts to the ELSP accordingly. 3196 */ 3197 static void execlists_submission_tasklet(unsigned long data) 3198 { 3199 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3200 bool timeout = preempt_timeout(engine); 3201 3202 process_csb(engine); 3203 3204 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3205 const char *msg; 3206 3207 /* Generate the error message in priority wrt to the user! 
*/ 3208 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3209 msg = "CS error"; /* thrown by a user payload */ 3210 else if (engine->execlists.error_interrupt & ERROR_CSB) 3211 msg = "invalid CSB event"; 3212 else 3213 msg = "internal error"; 3214 3215 engine->execlists.error_interrupt = 0; 3216 execlists_reset(engine, msg); 3217 } 3218 3219 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3220 unsigned long flags; 3221 3222 spin_lock_irqsave(&engine->active.lock, flags); 3223 __execlists_submission_tasklet(engine); 3224 spin_unlock_irqrestore(&engine->active.lock, flags); 3225 3226 /* Recheck after serialising with direct-submission */ 3227 if (unlikely(timeout && preempt_timeout(engine))) { 3228 cancel_timer(&engine->execlists.preempt); 3229 execlists_reset(engine, "preemption time out"); 3230 } 3231 } 3232 } 3233 3234 static void __execlists_kick(struct intel_engine_execlists *execlists) 3235 { 3236 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3237 tasklet_hi_schedule(&execlists->tasklet); 3238 } 3239 3240 #define execlists_kick(t, member) \ 3241 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3242 3243 static void execlists_timeslice(struct timer_list *timer) 3244 { 3245 execlists_kick(timer, timer); 3246 } 3247 3248 static void execlists_preempt(struct timer_list *timer) 3249 { 3250 execlists_kick(timer, preempt); 3251 } 3252 3253 static void queue_request(struct intel_engine_cs *engine, 3254 struct i915_request *rq) 3255 { 3256 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3257 list_add_tail(&rq->sched.link, 3258 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3259 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3260 } 3261 3262 static void __submit_queue_imm(struct intel_engine_cs *engine) 3263 { 3264 struct intel_engine_execlists * const execlists = &engine->execlists; 3265 3266 if (reset_in_progress(execlists)) 3267 return; /* defer until we restart the engine following reset */ 3268 3269 __execlists_submission_tasklet(engine); 3270 } 3271 3272 static void submit_queue(struct intel_engine_cs *engine, 3273 const struct i915_request *rq) 3274 { 3275 struct intel_engine_execlists *execlists = &engine->execlists; 3276 3277 if (rq_prio(rq) <= execlists->queue_priority_hint) 3278 return; 3279 3280 execlists->queue_priority_hint = rq_prio(rq); 3281 __submit_queue_imm(engine); 3282 } 3283 3284 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3285 const struct i915_request *rq) 3286 { 3287 GEM_BUG_ON(i915_request_on_hold(rq)); 3288 return !list_empty(&engine->active.hold) && hold_request(rq); 3289 } 3290 3291 static void flush_csb(struct intel_engine_cs *engine) 3292 { 3293 struct intel_engine_execlists *el = &engine->execlists; 3294 3295 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3296 if (!reset_in_progress(el)) 3297 process_csb(engine); 3298 tasklet_unlock(&el->tasklet); 3299 } 3300 } 3301 3302 static void execlists_submit_request(struct i915_request *request) 3303 { 3304 struct intel_engine_cs *engine = request->engine; 3305 unsigned long flags; 3306 3307 /* Hopefully we clear execlists->pending[] to let us through */ 3308 flush_csb(engine); 3309 3310 /* Will be called from irq-context when using foreign fences. 
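 *
 * (Hence the irqsave variant below: when the submit fence is signaled
 * by another driver's fence callback we may already be in hardirq
 * context, so we cannot assume interrupts are enabled on entry.)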
*/ 3311 spin_lock_irqsave(&engine->active.lock, flags); 3312 3313 if (unlikely(ancestor_on_hold(engine, request))) { 3314 RQ_TRACE(request, "ancestor on hold\n"); 3315 list_add_tail(&request->sched.link, &engine->active.hold); 3316 i915_request_set_hold(request); 3317 } else { 3318 queue_request(engine, request); 3319 3320 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3321 GEM_BUG_ON(list_empty(&request->sched.link)); 3322 3323 submit_queue(engine, request); 3324 } 3325 3326 spin_unlock_irqrestore(&engine->active.lock, flags); 3327 } 3328 3329 static void __execlists_context_fini(struct intel_context *ce) 3330 { 3331 intel_ring_put(ce->ring); 3332 i915_vma_put(ce->state); 3333 } 3334 3335 static void execlists_context_destroy(struct kref *kref) 3336 { 3337 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3338 3339 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3340 GEM_BUG_ON(intel_context_is_pinned(ce)); 3341 3342 if (ce->state) 3343 __execlists_context_fini(ce); 3344 3345 intel_context_fini(ce); 3346 intel_context_free(ce); 3347 } 3348 3349 static void 3350 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3351 { 3352 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3353 return; 3354 3355 vaddr += engine->context_size; 3356 3357 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3358 } 3359 3360 static void 3361 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3362 { 3363 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3364 return; 3365 3366 vaddr += engine->context_size; 3367 3368 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3369 drm_err_once(&engine->i915->drm, 3370 "%s context redzone overwritten!\n", 3371 engine->name); 3372 } 3373 3374 static void execlists_context_unpin(struct intel_context *ce) 3375 { 3376 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3377 ce->engine); 3378 } 3379 3380 static void execlists_context_post_unpin(struct intel_context *ce) 3381 { 3382 i915_gem_object_unpin_map(ce->state->obj); 3383 } 3384 3385 static u32 * 3386 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3387 { 3388 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3389 MI_SRM_LRM_GLOBAL_GTT | 3390 MI_LRI_LRM_CS_MMIO; 3391 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3392 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3393 CTX_TIMESTAMP * sizeof(u32); 3394 *cs++ = 0; 3395 3396 *cs++ = MI_LOAD_REGISTER_REG | 3397 MI_LRR_SOURCE_CS_MMIO | 3398 MI_LRI_LRM_CS_MMIO; 3399 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3400 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3401 3402 *cs++ = MI_LOAD_REGISTER_REG | 3403 MI_LRR_SOURCE_CS_MMIO | 3404 MI_LRI_LRM_CS_MMIO; 3405 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3406 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3407 3408 return cs; 3409 } 3410 3411 static u32 * 3412 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3413 { 3414 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3415 3416 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3417 MI_SRM_LRM_GLOBAL_GTT | 3418 MI_LRI_LRM_CS_MMIO; 3419 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3420 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3421 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3422 *cs++ = 0; 3423 3424 return cs; 3425 } 3426 3427 static u32 * 3428 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3429 { 3430 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3431 3432 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3433 
MI_SRM_LRM_GLOBAL_GTT | 3434 MI_LRI_LRM_CS_MMIO; 3435 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3436 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3437 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3438 *cs++ = 0; 3439 3440 *cs++ = MI_LOAD_REGISTER_REG | 3441 MI_LRR_SOURCE_CS_MMIO | 3442 MI_LRI_LRM_CS_MMIO; 3443 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3444 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3445 3446 return cs; 3447 } 3448 3449 static u32 * 3450 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3451 { 3452 cs = gen12_emit_timestamp_wa(ce, cs); 3453 cs = gen12_emit_cmd_buf_wa(ce, cs); 3454 cs = gen12_emit_restore_scratch(ce, cs); 3455 3456 return cs; 3457 } 3458 3459 static u32 * 3460 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3461 { 3462 cs = gen12_emit_timestamp_wa(ce, cs); 3463 cs = gen12_emit_restore_scratch(ce, cs); 3464 3465 return cs; 3466 } 3467 3468 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3469 { 3470 return PAGE_SIZE * ce->wa_bb_page; 3471 } 3472 3473 static u32 *context_indirect_bb(const struct intel_context *ce) 3474 { 3475 void *ptr; 3476 3477 GEM_BUG_ON(!ce->wa_bb_page); 3478 3479 ptr = ce->lrc_reg_state; 3480 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3481 ptr += context_wa_bb_offset(ce); 3482 3483 return ptr; 3484 } 3485 3486 static void 3487 setup_indirect_ctx_bb(const struct intel_context *ce, 3488 const struct intel_engine_cs *engine, 3489 u32 *(*emit)(const struct intel_context *, u32 *)) 3490 { 3491 u32 * const start = context_indirect_bb(ce); 3492 u32 *cs; 3493 3494 cs = emit(ce, start); 3495 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3496 while ((unsigned long)cs % CACHELINE_BYTES) 3497 *cs++ = MI_NOOP; 3498 3499 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3500 i915_ggtt_offset(ce->state) + 3501 context_wa_bb_offset(ce), 3502 (cs - start) * sizeof(*cs)); 3503 } 3504 3505 static void 3506 __execlists_update_reg_state(const struct intel_context *ce, 3507 const struct intel_engine_cs *engine, 3508 u32 head) 3509 { 3510 struct intel_ring *ring = ce->ring; 3511 u32 *regs = ce->lrc_reg_state; 3512 3513 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3514 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3515 3516 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3517 regs[CTX_RING_HEAD] = head; 3518 regs[CTX_RING_TAIL] = ring->tail; 3519 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3520 3521 /* RPCS */ 3522 if (engine->class == RENDER_CLASS) { 3523 regs[CTX_R_PWR_CLK_STATE] = 3524 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3525 3526 i915_oa_init_reg_state(ce, engine); 3527 } 3528 3529 if (ce->wa_bb_page) { 3530 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3531 3532 fn = gen12_emit_indirect_ctx_xcs; 3533 if (ce->engine->class == RENDER_CLASS) 3534 fn = gen12_emit_indirect_ctx_rcs; 3535 3536 /* Mutually exclusive wrt to global indirect bb */ 3537 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3538 setup_indirect_ctx_bb(ce, engine, fn); 3539 } 3540 } 3541 3542 static int 3543 execlists_context_pre_pin(struct intel_context *ce, 3544 struct i915_gem_ww_ctx *ww, void **vaddr) 3545 { 3546 GEM_BUG_ON(!ce->state); 3547 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3548 3549 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3550 i915_coherent_map_type(ce->engine->i915) | 3551 I915_MAP_OVERRIDE); 3552 3553 return PTR_ERR_OR_ZERO(*vaddr); 3554 } 3555 3556 static int 3557 
__execlists_context_pin(struct intel_context *ce, 3558 struct intel_engine_cs *engine, 3559 void *vaddr) 3560 { 3561 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3562 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3563 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3564 3565 return 0; 3566 } 3567 3568 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3569 { 3570 return __execlists_context_pin(ce, ce->engine, vaddr); 3571 } 3572 3573 static int execlists_context_alloc(struct intel_context *ce) 3574 { 3575 return __execlists_context_alloc(ce, ce->engine); 3576 } 3577 3578 static void execlists_context_reset(struct intel_context *ce) 3579 { 3580 CE_TRACE(ce, "reset\n"); 3581 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3582 3583 intel_ring_reset(ce->ring, ce->ring->emit); 3584 3585 /* Scrub away the garbage */ 3586 execlists_init_reg_state(ce->lrc_reg_state, 3587 ce, ce->engine, ce->ring, true); 3588 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3589 3590 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3591 } 3592 3593 static const struct intel_context_ops execlists_context_ops = { 3594 .alloc = execlists_context_alloc, 3595 3596 .pre_pin = execlists_context_pre_pin, 3597 .pin = execlists_context_pin, 3598 .unpin = execlists_context_unpin, 3599 .post_unpin = execlists_context_post_unpin, 3600 3601 .enter = intel_context_enter_engine, 3602 .exit = intel_context_exit_engine, 3603 3604 .reset = execlists_context_reset, 3605 .destroy = execlists_context_destroy, 3606 }; 3607 3608 static u32 hwsp_offset(const struct i915_request *rq) 3609 { 3610 const struct intel_timeline_cacheline *cl; 3611 3612 /* Before the request is executed, the timeline/cachline is fixed */ 3613 3614 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); 3615 if (cl) 3616 return cl->ggtt_offset; 3617 3618 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; 3619 } 3620 3621 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3622 { 3623 u32 *cs; 3624 3625 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3626 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3627 return 0; 3628 3629 cs = intel_ring_begin(rq, 6); 3630 if (IS_ERR(cs)) 3631 return PTR_ERR(cs); 3632 3633 /* 3634 * Check if we have been preempted before we even get started. 3635 * 3636 * After this point i915_request_started() reports true, even if 3637 * we get preempted and so are no longer running. 3638 */ 3639 *cs++ = MI_ARB_CHECK; 3640 *cs++ = MI_NOOP; 3641 3642 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3643 *cs++ = hwsp_offset(rq); 3644 *cs++ = 0; 3645 *cs++ = rq->fence.seqno - 1; 3646 3647 intel_ring_advance(rq, cs); 3648 3649 /* Record the updated position of the request's payload */ 3650 rq->infix = intel_ring_offset(rq, cs); 3651 3652 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3653 3654 return 0; 3655 } 3656 3657 static int emit_pdps(struct i915_request *rq) 3658 { 3659 const struct intel_engine_cs * const engine = rq->engine; 3660 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3661 int err, i; 3662 u32 *cs; 3663 3664 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3665 3666 /* 3667 * Beware ye of the dragons, this sequence is magic! 3668 * 3669 * Small changes to this sequence can cause anything from 3670 * GPU hangs to forcewake errors and machine lockups! 
3671 */
3672
3673 /* Flush any residual operations from the context load */
3674 err = engine->emit_flush(rq, EMIT_FLUSH);
3675 if (err)
3676 return err;
3677
3678 /* Magic required to prevent forcewake errors! */
3679 err = engine->emit_flush(rq, EMIT_INVALIDATE);
3680 if (err)
3681 return err;
3682
3683 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3684 if (IS_ERR(cs))
3685 return PTR_ERR(cs);
3686
3687 /* Ensure the LRI have landed before we invalidate & continue */
3688 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3689 for (i = GEN8_3LVL_PDPES; i--; ) {
3690 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3691 u32 base = engine->mmio_base;
3692
3693 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3694 *cs++ = upper_32_bits(pd_daddr);
3695 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3696 *cs++ = lower_32_bits(pd_daddr);
3697 }
3698 *cs++ = MI_NOOP;
3699
3700 intel_ring_advance(rq, cs);
3701
3702 return 0;
3703 }
3704
3705 static int execlists_request_alloc(struct i915_request *request)
3706 {
3707 int ret;
3708
3709 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3710
3711 /*
3712 * Flush enough space to reduce the likelihood of waiting after
3713 * we start building the request - in which case we will just
3714 * have to repeat work.
3715 */
3716 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3717
3718 /*
3719 * Note that after this point, we have committed to using
3720 * this request as it is being used to both track the
3721 * state of engine initialisation and liveness of the
3722 * golden renderstate above. Think twice before you try
3723 * to cancel/unwind this request now.
3724 */
3725
3726 if (!i915_vm_is_4lvl(request->context->vm)) {
3727 ret = emit_pdps(request);
3728 if (ret)
3729 return ret;
3730 }
3731
3732 /* Unconditionally invalidate GPU caches and TLBs. */
3733 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3734 if (ret)
3735 return ret;
3736
3737 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3738 return 0;
3739 }
3740
3741 /*
3742 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3743 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3744 * but there is a slight complication as this is applied in the WA batch where the
3745 * values are only initialized once so we cannot take the register value at the
3746 * beginning and reuse it further; hence we save its value to memory, upload a
3747 * constant value with bit21 set and then we restore it back with the saved value.
3748 * To simplify the WA, a constant value is formed by using the default value
3749 * of this register. This shouldn't be a problem because we are only modifying
3750 * it for a short period and this batch is non-preemptible. We can of course
3751 * use additional instructions that read the actual value of the register
3752 * at that time and set our bit of interest but it makes the WA complicated.
3753 *
3754 * This WA is also required for Gen9 so extracting as a function avoids
3755 * code duplication.
3756 */
3757 static u32 *
3758 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3759 {
3760 /* NB no one else is allowed to scribble over scratch + 256!
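 *
 * (The emission below is the save/modify/restore sequence described
 * above: SRM stashes the live GEN8_L3SQCREG4 value into the per-gt
 * scratch slot, LRI loads the constant with the flush-coherent-lines
 * bit set, a PIPE_CONTROL with CS stall + DC flush makes the flush
 * happen, and LRM then restores the saved register value.)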
*/ 3761 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3762 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3763 *batch++ = intel_gt_scratch_offset(engine->gt, 3764 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3765 *batch++ = 0; 3766 3767 *batch++ = MI_LOAD_REGISTER_IMM(1); 3768 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3769 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3770 3771 batch = gen8_emit_pipe_control(batch, 3772 PIPE_CONTROL_CS_STALL | 3773 PIPE_CONTROL_DC_FLUSH_ENABLE, 3774 0); 3775 3776 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3777 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3778 *batch++ = intel_gt_scratch_offset(engine->gt, 3779 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3780 *batch++ = 0; 3781 3782 return batch; 3783 } 3784 3785 /* 3786 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3787 * initialized at the beginning and shared across all contexts but this field 3788 * helps us to have multiple batches at different offsets and select them based 3789 * on a criteria. At the moment this batch always start at the beginning of the page 3790 * and at this point we don't have multiple wa_ctx batch buffers. 3791 * 3792 * The number of WA applied are not known at the beginning; we use this field 3793 * to return the no of DWORDS written. 3794 * 3795 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3796 * so it adds NOOPs as padding to make it cacheline aligned. 3797 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3798 * makes a complete batch buffer. 3799 */ 3800 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3801 { 3802 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3803 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3804 3805 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3806 if (IS_BROADWELL(engine->i915)) 3807 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3808 3809 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3810 /* Actual scratch location is at 128 bytes offset */ 3811 batch = gen8_emit_pipe_control(batch, 3812 PIPE_CONTROL_FLUSH_L3 | 3813 PIPE_CONTROL_STORE_DATA_INDEX | 3814 PIPE_CONTROL_CS_STALL | 3815 PIPE_CONTROL_QW_WRITE, 3816 LRC_PPHWSP_SCRATCH_ADDR); 3817 3818 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3819 3820 /* Pad to end of cacheline */ 3821 while ((unsigned long)batch % CACHELINE_BYTES) 3822 *batch++ = MI_NOOP; 3823 3824 /* 3825 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3826 * execution depends on the length specified in terms of cache lines 3827 * in the register CTX_RCS_INDIRECT_CTX 3828 */ 3829 3830 return batch; 3831 } 3832 3833 struct lri { 3834 i915_reg_t reg; 3835 u32 value; 3836 }; 3837 3838 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3839 { 3840 GEM_BUG_ON(!count || count > 63); 3841 3842 *batch++ = MI_LOAD_REGISTER_IMM(count); 3843 do { 3844 *batch++ = i915_mmio_reg_offset(lri->reg); 3845 *batch++ = lri->value; 3846 } while (lri++, --count); 3847 *batch++ = MI_NOOP; 3848 3849 return batch; 3850 } 3851 3852 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3853 { 3854 static const struct lri lri[] = { 3855 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3856 { 3857 COMMON_SLICE_CHICKEN2, 3858 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3859 0), 3860 }, 3861 3862 /* BSpec: 11391 */ 3863 { 3864 FF_SLICE_CHICKEN, 3865 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3866 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3867 }, 3868 3869 /* BSpec: 11299 */ 3870 { 3871 _3D_CHICKEN3, 3872 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3873 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3874 } 3875 }; 3876 3877 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3878 3879 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3880 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3881 3882 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3883 batch = gen8_emit_pipe_control(batch, 3884 PIPE_CONTROL_FLUSH_L3 | 3885 PIPE_CONTROL_STORE_DATA_INDEX | 3886 PIPE_CONTROL_CS_STALL | 3887 PIPE_CONTROL_QW_WRITE, 3888 LRC_PPHWSP_SCRATCH_ADDR); 3889 3890 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3891 3892 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3893 if (HAS_POOLED_EU(engine->i915)) { 3894 /* 3895 * EU pool configuration is setup along with golden context 3896 * during context initialization. This value depends on 3897 * device type (2x6 or 3x6) and needs to be updated based 3898 * on which subslice is disabled especially for 2x6 3899 * devices, however it is safe to load default 3900 * configuration of 3x6 device instead of masking off 3901 * corresponding bits because HW ignores bits of a disabled 3902 * subslice and drops down to appropriate config. Please 3903 * see render_state_setup() in i915_gem_render_state.c for 3904 * possible configurations, to avoid duplication they are 3905 * not shown here again. 3906 */ 3907 *batch++ = GEN9_MEDIA_POOL_STATE; 3908 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3909 *batch++ = 0x00777000; 3910 *batch++ = 0; 3911 *batch++ = 0; 3912 *batch++ = 0; 3913 } 3914 3915 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3916 3917 /* Pad to end of cacheline */ 3918 while ((unsigned long)batch % CACHELINE_BYTES) 3919 *batch++ = MI_NOOP; 3920 3921 return batch; 3922 } 3923 3924 static u32 * 3925 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3926 { 3927 int i; 3928 3929 /* 3930 * WaPipeControlBefore3DStateSamplePattern: cnl 3931 * 3932 * Ensure the engine is idle prior to programming a 3933 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3934 */ 3935 batch = gen8_emit_pipe_control(batch, 3936 PIPE_CONTROL_CS_STALL, 3937 0); 3938 /* 3939 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3940 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3941 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3942 * confusing. Since gen8_emit_pipe_control() already advances the 3943 * batch by 6 dwords, we advance the other 10 here, completing a 3944 * cacheline. It's not clear if the workaround requires this padding 3945 * before other commands, or if it's just the regular padding we would 3946 * already have for the workaround bb, so leave it here for now. 
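 *
 * For the arithmetic: gen8_emit_pipe_control() above emits 6 dwords and
 * the loop below adds 10 MI_NOOPs, i.e. 16 dwords = 64 bytes, exactly
 * the one CACHELINE_BYTES worth of space that the workaround's
 * 4 + 12 dword recipe adds up to.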
3947 */ 3948 for (i = 0; i < 10; i++) 3949 *batch++ = MI_NOOP; 3950 3951 /* Pad to end of cacheline */ 3952 while ((unsigned long)batch % CACHELINE_BYTES) 3953 *batch++ = MI_NOOP; 3954 3955 return batch; 3956 } 3957 3958 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3959 3960 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3961 { 3962 struct drm_i915_gem_object *obj; 3963 struct i915_vma *vma; 3964 int err; 3965 3966 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3967 if (IS_ERR(obj)) 3968 return PTR_ERR(obj); 3969 3970 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3971 if (IS_ERR(vma)) { 3972 err = PTR_ERR(vma); 3973 goto err; 3974 } 3975 3976 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3977 if (err) 3978 goto err; 3979 3980 engine->wa_ctx.vma = vma; 3981 return 0; 3982 3983 err: 3984 i915_gem_object_put(obj); 3985 return err; 3986 } 3987 3988 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3989 { 3990 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3991 } 3992 3993 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3994 3995 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3996 { 3997 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3998 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3999 &wa_ctx->per_ctx }; 4000 wa_bb_func_t wa_bb_fn[2]; 4001 void *batch, *batch_ptr; 4002 unsigned int i; 4003 int ret; 4004 4005 if (engine->class != RENDER_CLASS) 4006 return 0; 4007 4008 switch (INTEL_GEN(engine->i915)) { 4009 case 12: 4010 case 11: 4011 return 0; 4012 case 10: 4013 wa_bb_fn[0] = gen10_init_indirectctx_bb; 4014 wa_bb_fn[1] = NULL; 4015 break; 4016 case 9: 4017 wa_bb_fn[0] = gen9_init_indirectctx_bb; 4018 wa_bb_fn[1] = NULL; 4019 break; 4020 case 8: 4021 wa_bb_fn[0] = gen8_init_indirectctx_bb; 4022 wa_bb_fn[1] = NULL; 4023 break; 4024 default: 4025 MISSING_CASE(INTEL_GEN(engine->i915)); 4026 return 0; 4027 } 4028 4029 ret = lrc_setup_wa_ctx(engine); 4030 if (ret) { 4031 drm_dbg(&engine->i915->drm, 4032 "Failed to setup context WA page: %d\n", ret); 4033 return ret; 4034 } 4035 4036 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 4037 4038 /* 4039 * Emit the two workaround batch buffers, recording the offset from the 4040 * start of the workaround batch buffer object for each and their 4041 * respective sizes. 4042 */ 4043 batch_ptr = batch; 4044 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 4045 wa_bb[i]->offset = batch_ptr - batch; 4046 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 4047 CACHELINE_BYTES))) { 4048 ret = -EINVAL; 4049 break; 4050 } 4051 if (wa_bb_fn[i]) 4052 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 4053 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 4054 } 4055 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 4056 4057 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4058 __i915_gem_object_release_map(wa_ctx->vma->obj); 4059 if (ret) 4060 lrc_destroy_wa_ctx(engine); 4061 4062 return ret; 4063 } 4064 4065 static void reset_csb_pointers(struct intel_engine_cs *engine) 4066 { 4067 struct intel_engine_execlists * const execlists = &engine->execlists; 4068 const unsigned int reset_value = execlists->csb_size - 1; 4069 4070 ring_set_paused(engine, 0); 4071 4072 /* 4073 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4074 * Bludgeon them with a mmio update to be sure. 
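 *
 * The write below looks like a masked update: the 0xffff in the upper
 * half selects the pointer fields to change, while both pointer bytes
 * in the lower half are forced back to csb_size - 1. (This is an
 * interpretation of the encoding for the reader's benefit, not an
 * additional requirement.)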
4075 */ 4076 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4077 0xffff << 16 | reset_value << 8 | reset_value); 4078 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4079 4080 /* 4081 * After a reset, the HW starts writing into CSB entry [0]. We 4082 * therefore have to set our HEAD pointer back one entry so that 4083 * the *first* entry we check is entry 0. To complicate this further, 4084 * as we don't wait for the first interrupt after reset, we have to 4085 * fake the HW write to point back to the last entry so that our 4086 * inline comparison of our cached head position against the last HW 4087 * write works even before the first interrupt. 4088 */ 4089 execlists->csb_head = reset_value; 4090 WRITE_ONCE(*execlists->csb_write, reset_value); 4091 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4092 4093 /* Check that the GPU does indeed update the CSB entries! */ 4094 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4095 invalidate_csb_entries(&execlists->csb_status[0], 4096 &execlists->csb_status[reset_value]); 4097 4098 /* Once more for luck and our trusty paranoia */ 4099 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4100 0xffff << 16 | reset_value << 8 | reset_value); 4101 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4102 4103 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4104 } 4105 4106 static void execlists_sanitize(struct intel_engine_cs *engine) 4107 { 4108 GEM_BUG_ON(execlists_active(&engine->execlists)); 4109 4110 /* 4111 * Poison residual state on resume, in case the suspend didn't! 4112 * 4113 * We have to assume that across suspend/resume (or other loss 4114 * of control) that the contents of our pinned buffers has been 4115 * lost, replaced by garbage. Since this doesn't always happen, 4116 * let's poison such state so that we more quickly spot when 4117 * we falsely assume it has been preserved. 4118 */ 4119 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4120 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4121 4122 reset_csb_pointers(engine); 4123 4124 /* 4125 * The kernel_context HWSP is stored in the status_page. As above, 4126 * that may be lost on resume/initialisation, and so we need to 4127 * reset the value in the HWSP. 4128 */ 4129 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4130 4131 /* And scrub the dirty cachelines for the HWSP */ 4132 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4133 } 4134 4135 static void enable_error_interrupt(struct intel_engine_cs *engine) 4136 { 4137 u32 status; 4138 4139 engine->execlists.error_interrupt = 0; 4140 ENGINE_WRITE(engine, RING_EMR, ~0u); 4141 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4142 4143 status = ENGINE_READ(engine, RING_ESR); 4144 if (unlikely(status)) { 4145 drm_err(&engine->i915->drm, 4146 "engine '%s' resumed still in error: %08x\n", 4147 engine->name, status); 4148 __intel_gt_reset(engine->gt, engine->mask); 4149 } 4150 4151 /* 4152 * On current gen8+, we have 2 signals to play with 4153 * 4154 * - I915_ERROR_INSTUCTION (bit 0) 4155 * 4156 * Generate an error if the command parser encounters an invalid 4157 * instruction 4158 * 4159 * This is a fatal error. 4160 * 4161 * - CP_PRIV (bit 2) 4162 * 4163 * Generate an error on privilege violation (where the CP replaces 4164 * the instruction with a no-op). This also fires for writes into 4165 * read-only scratch pages. 4166 * 4167 * This is a non-fatal error, parsing continues. 
4168 * 4169 * * there are a few others defined for odd HW that we do not use 4170 * 4171 * Since CP_PRIV fires for cases where we have chosen to ignore the 4172 * error (as the HW is validating and suppressing the mistakes), we 4173 * only unmask the instruction error bit. 4174 */ 4175 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4176 } 4177 4178 static void enable_execlists(struct intel_engine_cs *engine) 4179 { 4180 u32 mode; 4181 4182 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4183 4184 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4185 4186 if (INTEL_GEN(engine->i915) >= 11) 4187 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4188 else 4189 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4190 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4191 4192 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4193 4194 ENGINE_WRITE_FW(engine, 4195 RING_HWS_PGA, 4196 i915_ggtt_offset(engine->status_page.vma)); 4197 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4198 4199 enable_error_interrupt(engine); 4200 4201 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4202 } 4203 4204 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4205 { 4206 bool unexpected = false; 4207 4208 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4209 drm_dbg(&engine->i915->drm, 4210 "STOP_RING still set in RING_MI_MODE\n"); 4211 unexpected = true; 4212 } 4213 4214 return unexpected; 4215 } 4216 4217 static int execlists_resume(struct intel_engine_cs *engine) 4218 { 4219 intel_mocs_init_engine(engine); 4220 4221 intel_breadcrumbs_reset(engine->breadcrumbs); 4222 4223 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4224 struct drm_printer p = drm_debug_printer(__func__); 4225 4226 intel_engine_dump(engine, &p, NULL); 4227 } 4228 4229 enable_execlists(engine); 4230 4231 return 0; 4232 } 4233 4234 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4235 { 4236 struct intel_engine_execlists * const execlists = &engine->execlists; 4237 unsigned long flags; 4238 4239 ENGINE_TRACE(engine, "depth<-%d\n", 4240 atomic_read(&execlists->tasklet.count)); 4241 4242 /* 4243 * Prevent request submission to the hardware until we have 4244 * completed the reset in i915_gem_reset_finish(). If a request 4245 * is completed by one engine, it may then queue a request 4246 * to a second via its execlists->tasklet *just* as we are 4247 * calling engine->resume() and also writing the ELSP. 4248 * Turning off the execlists->tasklet until the reset is over 4249 * prevents the race. 4250 */ 4251 __tasklet_disable_sync_once(&execlists->tasklet); 4252 GEM_BUG_ON(!reset_in_progress(execlists)); 4253 4254 /* And flush any current direct submission. */ 4255 spin_lock_irqsave(&engine->active.lock, flags); 4256 spin_unlock_irqrestore(&engine->active.lock, flags); 4257 4258 /* 4259 * We stop engines, otherwise we might get failed reset and a 4260 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4261 * from system hang if batchbuffer is progressing when 4262 * the reset is issued, regardless of READY_TO_RESET ack. 4263 * Thus assume it is best to stop engines on all gens 4264 * where we have a gpu reset. 
4265 * 4266 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4267 * 4268 * FIXME: Wa for more modern gens needs to be validated 4269 */ 4270 ring_set_paused(engine, 1); 4271 intel_engine_stop_cs(engine); 4272 4273 engine->execlists.reset_ccid = active_ccid(engine); 4274 } 4275 4276 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4277 { 4278 int x; 4279 4280 x = lrc_ring_mi_mode(engine); 4281 if (x != -1) { 4282 regs[x + 1] &= ~STOP_RING; 4283 regs[x + 1] |= STOP_RING << 16; 4284 } 4285 } 4286 4287 static void __execlists_reset_reg_state(const struct intel_context *ce, 4288 const struct intel_engine_cs *engine) 4289 { 4290 u32 *regs = ce->lrc_reg_state; 4291 4292 __reset_stop_ring(regs, engine); 4293 } 4294 4295 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4296 { 4297 struct intel_engine_execlists * const execlists = &engine->execlists; 4298 struct intel_context *ce; 4299 struct i915_request *rq; 4300 u32 head; 4301 4302 mb(); /* paranoia: read the CSB pointers from after the reset */ 4303 clflush(execlists->csb_write); 4304 mb(); 4305 4306 process_csb(engine); /* drain preemption events */ 4307 4308 /* Following the reset, we need to reload the CSB read/write pointers */ 4309 reset_csb_pointers(engine); 4310 4311 /* 4312 * Save the currently executing context, even if we completed 4313 * its request, it was still running at the time of the 4314 * reset and will have been clobbered. 4315 */ 4316 rq = active_context(engine, engine->execlists.reset_ccid); 4317 if (!rq) 4318 goto unwind; 4319 4320 ce = rq->context; 4321 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4322 4323 if (i915_request_completed(rq)) { 4324 /* Idle context; tidy up the ring so we can restart afresh */ 4325 head = intel_ring_wrap(ce->ring, rq->tail); 4326 goto out_replay; 4327 } 4328 4329 /* We still have requests in-flight; the engine should be active */ 4330 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4331 4332 /* Context has requests still in-flight; it should not be idle! */ 4333 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4334 4335 rq = active_request(ce->timeline, rq); 4336 head = intel_ring_wrap(ce->ring, rq->head); 4337 GEM_BUG_ON(head == ce->ring->tail); 4338 4339 /* 4340 * If this request hasn't started yet, e.g. it is waiting on a 4341 * semaphore, we need to avoid skipping the request or else we 4342 * break the signaling chain. However, if the context is corrupt 4343 * the request will not restart and we will be stuck with a wedged 4344 * device. It is quite often the case that if we issue a reset 4345 * while the GPU is loading the context image, that the context 4346 * image becomes corrupt. 4347 * 4348 * Otherwise, if we have not started yet, the request should replay 4349 * perfectly and we do not need to flag the result as being erroneous. 4350 */ 4351 if (!i915_request_started(rq)) 4352 goto out_replay; 4353 4354 /* 4355 * If the request was innocent, we leave the request in the ELSP 4356 * and will try to replay it on restarting. The context image may 4357 * have been corrupted by the reset, in which case we may have 4358 * to service a new GPU hang, but more likely we can continue on 4359 * without impact. 4360 * 4361 * If the request was guilty, we presume the context is corrupt 4362 * and have to at least restore the RING register in the context 4363 * image back to the expected values to skip over the guilty request. 
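 *
 * Summarising the three paths: a request that already completed only
 * has its ring tidied (head = intel_ring_wrap(ce->ring, rq->tail)); a
 * request that never started is replayed as-is from the wrapped
 * rq->head; only a started-but-incomplete request reaches
 * __i915_request_reset() below. All three paths end up at out_replay,
 * where the register state is scrubbed and a full context restore is
 * forced.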
4364 */ 4365 __i915_request_reset(rq, stalled); 4366 4367 /* 4368 * We want a simple context + ring to execute the breadcrumb update. 4369 * We cannot rely on the context being intact across the GPU hang, 4370 * so clear it and rebuild just what we need for the breadcrumb. 4371 * All pending requests for this context will be zapped, and any 4372 * future request will be after userspace has had the opportunity 4373 * to recreate its own state. 4374 */ 4375 out_replay: 4376 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4377 head, ce->ring->tail); 4378 __execlists_reset_reg_state(ce, engine); 4379 __execlists_update_reg_state(ce, engine, head); 4380 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4381 4382 unwind: 4383 /* Push back any incomplete requests for replay after the reset. */ 4384 cancel_port_requests(execlists); 4385 __unwind_incomplete_requests(engine); 4386 } 4387 4388 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4389 { 4390 unsigned long flags; 4391 4392 ENGINE_TRACE(engine, "\n"); 4393 4394 spin_lock_irqsave(&engine->active.lock, flags); 4395 4396 __execlists_reset(engine, stalled); 4397 4398 spin_unlock_irqrestore(&engine->active.lock, flags); 4399 } 4400 4401 static void nop_submission_tasklet(unsigned long data) 4402 { 4403 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4404 4405 /* The driver is wedged; don't process any more events. */ 4406 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4407 } 4408 4409 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4410 { 4411 struct intel_engine_execlists * const execlists = &engine->execlists; 4412 struct i915_request *rq, *rn; 4413 struct rb_node *rb; 4414 unsigned long flags; 4415 4416 ENGINE_TRACE(engine, "\n"); 4417 4418 /* 4419 * Before we call engine->cancel_requests(), we should have exclusive 4420 * access to the submission state. This is arranged for us by the 4421 * caller disabling the interrupt generation, the tasklet and other 4422 * threads that may then access the same state, giving us a free hand 4423 * to reset state. However, we still need to let lockdep be aware that 4424 * we know this state may be accessed in hardirq context, so we 4425 * disable the irq around this manipulation and we want to keep 4426 * the spinlock focused on its duties and not accidentally conflate 4427 * coverage to the submission's irq state. (Similarly, although we 4428 * shouldn't need to disable irq around the manipulation of the 4429 * submission's irq state, we also wish to remind ourselves that 4430 * it is irq state.) 4431 */ 4432 spin_lock_irqsave(&engine->active.lock, flags); 4433 4434 __execlists_reset(engine, true); 4435 4436 /* Mark all executing requests as skipped. */ 4437 list_for_each_entry(rq, &engine->active.requests, sched.link) 4438 mark_eio(rq); 4439 intel_engine_signal_breadcrumbs(engine); 4440 4441 /* Flush the queued requests to the timeline list (for retiring). 
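 *
 * Altogether this cancel path feeds four populations through mark_eio():
 * the in-flight requests walked just above, everything still sitting in
 * the priority queue below, anything parked on engine->active.hold, and
 * any request pending on an attached virtual engine.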
*/ 4442 while ((rb = rb_first_cached(&execlists->queue))) { 4443 struct i915_priolist *p = to_priolist(rb); 4444 int i; 4445 4446 priolist_for_each_request_consume(rq, rn, p, i) { 4447 mark_eio(rq); 4448 __i915_request_submit(rq); 4449 } 4450 4451 rb_erase_cached(&p->node, &execlists->queue); 4452 i915_priolist_free(p); 4453 } 4454 4455 /* On-hold requests will be flushed to timeline upon their release */ 4456 list_for_each_entry(rq, &engine->active.hold, sched.link) 4457 mark_eio(rq); 4458 4459 /* Cancel all attached virtual engines */ 4460 while ((rb = rb_first_cached(&execlists->virtual))) { 4461 struct virtual_engine *ve = 4462 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4463 4464 rb_erase_cached(rb, &execlists->virtual); 4465 RB_CLEAR_NODE(rb); 4466 4467 spin_lock(&ve->base.active.lock); 4468 rq = fetch_and_zero(&ve->request); 4469 if (rq) { 4470 mark_eio(rq); 4471 4472 rq->engine = engine; 4473 __i915_request_submit(rq); 4474 i915_request_put(rq); 4475 4476 ve->base.execlists.queue_priority_hint = INT_MIN; 4477 } 4478 spin_unlock(&ve->base.active.lock); 4479 } 4480 4481 /* Remaining _unready_ requests will be nop'ed when submitted */ 4482 4483 execlists->queue_priority_hint = INT_MIN; 4484 execlists->queue = RB_ROOT_CACHED; 4485 4486 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4487 execlists->tasklet.func = nop_submission_tasklet; 4488 4489 spin_unlock_irqrestore(&engine->active.lock, flags); 4490 } 4491 4492 static void execlists_reset_finish(struct intel_engine_cs *engine) 4493 { 4494 struct intel_engine_execlists * const execlists = &engine->execlists; 4495 4496 /* 4497 * After a GPU reset, we may have requests to replay. Do so now while 4498 * we still have the forcewake to be sure that the GPU is not allowed 4499 * to sleep before we restart and reload a context. 4500 */ 4501 GEM_BUG_ON(!reset_in_progress(execlists)); 4502 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4503 execlists->tasklet.func(execlists->tasklet.data); 4504 4505 if (__tasklet_enable(&execlists->tasklet)) 4506 /* And kick in case we missed a new request submission. */ 4507 tasklet_hi_schedule(&execlists->tasklet); 4508 ENGINE_TRACE(engine, "depth->%d\n", 4509 atomic_read(&execlists->tasklet.count)); 4510 } 4511 4512 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4513 u64 offset, u32 len, 4514 const unsigned int flags) 4515 { 4516 u32 *cs; 4517 4518 cs = intel_ring_begin(rq, 4); 4519 if (IS_ERR(cs)) 4520 return PTR_ERR(cs); 4521 4522 /* 4523 * WaDisableCtxRestoreArbitration:bdw,chv 4524 * 4525 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4526 * particular all the gen that do not need the w/a at all!), if we 4527 * took care to make sure that on every switch into this context 4528 * (both ordinary and for preemption) that arbitrartion was enabled 4529 * we would be fine. However, for gen8 there is another w/a that 4530 * requires us to not preempt inside GPGPU execution, so we keep 4531 * arbitration disabled for gen8 batches. Arbitration will be 4532 * re-enabled before we close the request 4533 * (engine->emit_fini_breadcrumb). 4534 */ 4535 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4536 4537 /* FIXME(BDW+): Address space and security selectors. */ 4538 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4539 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4540 *cs++ = lower_32_bits(offset); 4541 *cs++ = upper_32_bits(offset); 4542 4543 intel_ring_advance(rq, cs); 4544 4545 return 0; 4546 } 4547 4548 static int gen8_emit_bb_start(struct i915_request *rq, 4549 u64 offset, u32 len, 4550 const unsigned int flags) 4551 { 4552 u32 *cs; 4553 4554 cs = intel_ring_begin(rq, 6); 4555 if (IS_ERR(cs)) 4556 return PTR_ERR(cs); 4557 4558 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4559 4560 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4561 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4562 *cs++ = lower_32_bits(offset); 4563 *cs++ = upper_32_bits(offset); 4564 4565 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4566 *cs++ = MI_NOOP; 4567 4568 intel_ring_advance(rq, cs); 4569 4570 return 0; 4571 } 4572 4573 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4574 { 4575 ENGINE_WRITE(engine, RING_IMR, 4576 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4577 ENGINE_POSTING_READ(engine, RING_IMR); 4578 } 4579 4580 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4581 { 4582 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4583 } 4584 4585 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4586 { 4587 u32 cmd, *cs; 4588 4589 cs = intel_ring_begin(request, 4); 4590 if (IS_ERR(cs)) 4591 return PTR_ERR(cs); 4592 4593 cmd = MI_FLUSH_DW + 1; 4594 4595 /* We always require a command barrier so that subsequent 4596 * commands, such as breadcrumb interrupts, are strictly ordered 4597 * wrt the contents of the write cache being flushed to memory 4598 * (and thus being coherent from the CPU). 4599 */ 4600 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4601 4602 if (mode & EMIT_INVALIDATE) { 4603 cmd |= MI_INVALIDATE_TLB; 4604 if (request->engine->class == VIDEO_DECODE_CLASS) 4605 cmd |= MI_INVALIDATE_BSD; 4606 } 4607 4608 *cs++ = cmd; 4609 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4610 *cs++ = 0; /* upper addr */ 4611 *cs++ = 0; /* value */ 4612 intel_ring_advance(request, cs); 4613 4614 return 0; 4615 } 4616 4617 static int gen8_emit_flush_render(struct i915_request *request, 4618 u32 mode) 4619 { 4620 bool vf_flush_wa = false, dc_flush_wa = false; 4621 u32 *cs, flags = 0; 4622 int len; 4623 4624 flags |= PIPE_CONTROL_CS_STALL; 4625 4626 if (mode & EMIT_FLUSH) { 4627 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4628 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4629 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4630 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4631 } 4632 4633 if (mode & EMIT_INVALIDATE) { 4634 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4635 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4636 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4637 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4638 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4639 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4640 flags |= PIPE_CONTROL_QW_WRITE; 4641 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4642 4643 /* 4644 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4645 * pipe control. 
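 * That is, an extra empty PIPE_CONTROL (6 dwords) goes in front of the
 * real one; the WaForGAMHang dc_flush_wa similarly brackets it with a
 * DC-flush PIPE_CONTROL before and a CS-stall PIPE_CONTROL after, which
 * is where the len = 6, +6 and +12 accounting below comes from.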
4646 */ 4647 if (IS_GEN(request->engine->i915, 9)) 4648 vf_flush_wa = true; 4649 4650 /* WaForGAMHang:kbl */ 4651 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4652 dc_flush_wa = true; 4653 } 4654 4655 len = 6; 4656 4657 if (vf_flush_wa) 4658 len += 6; 4659 4660 if (dc_flush_wa) 4661 len += 12; 4662 4663 cs = intel_ring_begin(request, len); 4664 if (IS_ERR(cs)) 4665 return PTR_ERR(cs); 4666 4667 if (vf_flush_wa) 4668 cs = gen8_emit_pipe_control(cs, 0, 0); 4669 4670 if (dc_flush_wa) 4671 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4672 0); 4673 4674 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4675 4676 if (dc_flush_wa) 4677 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4678 4679 intel_ring_advance(request, cs); 4680 4681 return 0; 4682 } 4683 4684 static int gen11_emit_flush_render(struct i915_request *request, 4685 u32 mode) 4686 { 4687 if (mode & EMIT_FLUSH) { 4688 u32 *cs; 4689 u32 flags = 0; 4690 4691 flags |= PIPE_CONTROL_CS_STALL; 4692 4693 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4694 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4695 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4696 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4697 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4698 flags |= PIPE_CONTROL_QW_WRITE; 4699 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4700 4701 cs = intel_ring_begin(request, 6); 4702 if (IS_ERR(cs)) 4703 return PTR_ERR(cs); 4704 4705 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4706 intel_ring_advance(request, cs); 4707 } 4708 4709 if (mode & EMIT_INVALIDATE) { 4710 u32 *cs; 4711 u32 flags = 0; 4712 4713 flags |= PIPE_CONTROL_CS_STALL; 4714 4715 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4716 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4717 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4718 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4719 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4720 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4721 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4722 flags |= PIPE_CONTROL_QW_WRITE; 4723 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4724 4725 cs = intel_ring_begin(request, 6); 4726 if (IS_ERR(cs)) 4727 return PTR_ERR(cs); 4728 4729 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4730 intel_ring_advance(request, cs); 4731 } 4732 4733 return 0; 4734 } 4735 4736 static u32 preparser_disable(bool state) 4737 { 4738 return MI_ARB_CHECK | 1 << 8 | state; 4739 } 4740 4741 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4742 { 4743 static const i915_reg_t vd[] = { 4744 GEN12_VD0_AUX_NV, 4745 GEN12_VD1_AUX_NV, 4746 GEN12_VD2_AUX_NV, 4747 GEN12_VD3_AUX_NV, 4748 }; 4749 4750 static const i915_reg_t ve[] = { 4751 GEN12_VE0_AUX_NV, 4752 GEN12_VE1_AUX_NV, 4753 }; 4754 4755 if (engine->class == VIDEO_DECODE_CLASS) 4756 return vd[engine->instance]; 4757 4758 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4759 return ve[engine->instance]; 4760 4761 GEM_BUG_ON("unknown aux_inv_reg\n"); 4762 4763 return INVALID_MMIO_REG; 4764 } 4765 4766 static u32 * 4767 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4768 { 4769 *cs++ = MI_LOAD_REGISTER_IMM(1); 4770 *cs++ = i915_mmio_reg_offset(inv_reg); 4771 *cs++ = AUX_INV; 4772 *cs++ = MI_NOOP; 4773 4774 return cs; 4775 } 4776 4777 static int gen12_emit_flush_render(struct i915_request *request, 4778 u32 mode) 4779 { 4780 if (mode & EMIT_FLUSH) { 4781 u32 flags = 0; 4782 u32 *cs; 4783 4784 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4785 flags |= PIPE_CONTROL_FLUSH_L3; 4786 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4787 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4788 /* Wa_1409600907:tgl */ 4789 flags |= PIPE_CONTROL_DEPTH_STALL; 4790 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4791 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4792 4793 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4794 flags |= PIPE_CONTROL_QW_WRITE; 4795 4796 flags |= PIPE_CONTROL_CS_STALL; 4797 4798 cs = intel_ring_begin(request, 6); 4799 if (IS_ERR(cs)) 4800 return PTR_ERR(cs); 4801 4802 cs = gen12_emit_pipe_control(cs, 4803 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4804 flags, LRC_PPHWSP_SCRATCH_ADDR); 4805 intel_ring_advance(request, cs); 4806 } 4807 4808 if (mode & EMIT_INVALIDATE) { 4809 u32 flags = 0; 4810 u32 *cs; 4811 4812 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4813 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4814 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4815 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4816 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4817 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4818 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4819 4820 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4821 flags |= PIPE_CONTROL_QW_WRITE; 4822 4823 flags |= PIPE_CONTROL_CS_STALL; 4824 4825 cs = intel_ring_begin(request, 8 + 4); 4826 if (IS_ERR(cs)) 4827 return PTR_ERR(cs); 4828 4829 /* 4830 * Prevent the pre-parser from skipping past the TLB 4831 * invalidate and loading a stale page for the batch 4832 * buffer / request payload. 4833 */ 4834 *cs++ = preparser_disable(true); 4835 4836 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4837 4838 /* hsdes: 1809175790 */ 4839 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4840 4841 *cs++ = preparser_disable(false); 4842 intel_ring_advance(request, cs); 4843 } 4844 4845 return 0; 4846 } 4847 4848 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4849 { 4850 intel_engine_mask_t aux_inv = 0; 4851 u32 cmd, *cs; 4852 4853 cmd = 4; 4854 if (mode & EMIT_INVALIDATE) 4855 cmd += 2; 4856 if (mode & EMIT_INVALIDATE) 4857 aux_inv = request->engine->mask & ~BIT(BCS0); 4858 if (aux_inv) 4859 cmd += 2 * hweight8(aux_inv) + 2; 4860 4861 cs = intel_ring_begin(request, cmd); 4862 if (IS_ERR(cs)) 4863 return PTR_ERR(cs); 4864 4865 if (mode & EMIT_INVALIDATE) 4866 *cs++ = preparser_disable(true); 4867 4868 cmd = MI_FLUSH_DW + 1; 4869 4870 /* We always require a command barrier so that subsequent 4871 * commands, such as breadcrumb interrupts, are strictly ordered 4872 * wrt the contents of the write cache being flushed to memory 4873 * (and thus being coherent from the CPU). 
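 *
 * Concretely, the barrier below is an MI_FLUSH_DW with a post-sync dword
 * write into the LRC_PPHWSP_SCRATCH_ADDR slot:
 *
 *	MI_FLUSH_DW | MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW
 *	LRC_PPHWSP_SCRATCH_ADDR
 *	0	(upper address)
 *	0	(value)
 *
 * with MI_INVALIDATE_TLB (and MI_INVALIDATE_BSD for the video engines)
 * OR'ed in for the invalidate case.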
4874 */ 4875 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4876 4877 if (mode & EMIT_INVALIDATE) { 4878 cmd |= MI_INVALIDATE_TLB; 4879 if (request->engine->class == VIDEO_DECODE_CLASS) 4880 cmd |= MI_INVALIDATE_BSD; 4881 } 4882 4883 *cs++ = cmd; 4884 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4885 *cs++ = 0; /* upper addr */ 4886 *cs++ = 0; /* value */ 4887 4888 if (aux_inv) { /* hsdes: 1809175790 */ 4889 struct intel_engine_cs *engine; 4890 unsigned int tmp; 4891 4892 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4893 for_each_engine_masked(engine, request->engine->gt, 4894 aux_inv, tmp) { 4895 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4896 *cs++ = AUX_INV; 4897 } 4898 *cs++ = MI_NOOP; 4899 } 4900 4901 if (mode & EMIT_INVALIDATE) 4902 *cs++ = preparser_disable(false); 4903 4904 intel_ring_advance(request, cs); 4905 4906 return 0; 4907 } 4908 4909 static void assert_request_valid(struct i915_request *rq) 4910 { 4911 struct intel_ring *ring __maybe_unused = rq->ring; 4912 4913 /* Can we unwind this request without appearing to go forwards? */ 4914 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4915 } 4916 4917 /* 4918 * Reserve space for 2 NOOPs at the end of each request to be 4919 * used as a workaround for not being allowed to do lite 4920 * restore with HEAD==TAIL (WaIdleLiteRestore). 4921 */ 4922 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4923 { 4924 /* Ensure there's always at least one preemption point per-request. */ 4925 *cs++ = MI_ARB_CHECK; 4926 *cs++ = MI_NOOP; 4927 request->wa_tail = intel_ring_offset(request, cs); 4928 4929 /* Check that entire request is less than half the ring */ 4930 assert_request_valid(request); 4931 4932 return cs; 4933 } 4934 4935 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4936 { 4937 *cs++ = MI_SEMAPHORE_WAIT | 4938 MI_SEMAPHORE_GLOBAL_GTT | 4939 MI_SEMAPHORE_POLL | 4940 MI_SEMAPHORE_SAD_EQ_SDD; 4941 *cs++ = 0; 4942 *cs++ = intel_hws_preempt_address(request->engine); 4943 *cs++ = 0; 4944 4945 return cs; 4946 } 4947 4948 static __always_inline u32* 4949 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4950 { 4951 *cs++ = MI_USER_INTERRUPT; 4952 4953 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4954 if (intel_engine_has_semaphores(request->engine)) 4955 cs = emit_preempt_busywait(request, cs); 4956 4957 request->tail = intel_ring_offset(request, cs); 4958 assert_ring_tail_valid(request->ring, request->tail); 4959 4960 return gen8_emit_wa_tail(request, cs); 4961 } 4962 4963 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) 4964 { 4965 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); 4966 } 4967 4968 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4969 { 4970 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4971 } 4972 4973 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4974 { 4975 cs = gen8_emit_pipe_control(cs, 4976 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4977 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4978 PIPE_CONTROL_DC_FLUSH_ENABLE, 4979 0); 4980 4981 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4982 cs = gen8_emit_ggtt_write_rcs(cs, 4983 request->fence.seqno, 4984 hwsp_offset(request), 4985 PIPE_CONTROL_FLUSH_ENABLE | 4986 PIPE_CONTROL_CS_STALL); 4987 4988 return gen8_emit_fini_breadcrumb_tail(request, cs); 4989 } 4990 4991 static u32 * 4992 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 
4993 { 4994 cs = gen8_emit_ggtt_write_rcs(cs, 4995 request->fence.seqno, 4996 hwsp_offset(request), 4997 PIPE_CONTROL_CS_STALL | 4998 PIPE_CONTROL_TILE_CACHE_FLUSH | 4999 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5000 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5001 PIPE_CONTROL_DC_FLUSH_ENABLE | 5002 PIPE_CONTROL_FLUSH_ENABLE); 5003 5004 return gen8_emit_fini_breadcrumb_tail(request, cs); 5005 } 5006 5007 /* 5008 * Note that the CS instruction pre-parser will not stall on the breadcrumb 5009 * flush and will continue pre-fetching the instructions after it before the 5010 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 5011 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 5012 * of the next request before the memory has been flushed, we're guaranteed that 5013 * we won't access the batch itself too early. 5014 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 5015 * so, if the current request is modifying an instruction in the next request on 5016 * the same intel_context, we might pre-fetch and then execute the pre-update 5017 * instruction. To avoid this, the users of self-modifying code should either 5018 * disable the parser around the code emitting the memory writes, via a new flag 5019 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 5020 * the in-kernel use-cases we've opted to use a separate context, see 5021 * reloc_gpu() as an example. 5022 * All the above applies only to the instructions themselves. Non-inline data 5023 * used by the instructions is not pre-fetched. 5024 */ 5025 5026 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 5027 { 5028 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 5029 MI_SEMAPHORE_GLOBAL_GTT | 5030 MI_SEMAPHORE_POLL | 5031 MI_SEMAPHORE_SAD_EQ_SDD; 5032 *cs++ = 0; 5033 *cs++ = intel_hws_preempt_address(request->engine); 5034 *cs++ = 0; 5035 *cs++ = 0; 5036 *cs++ = MI_NOOP; 5037 5038 return cs; 5039 } 5040 5041 static __always_inline u32* 5042 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 5043 { 5044 *cs++ = MI_USER_INTERRUPT; 5045 5046 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 5047 if (intel_engine_has_semaphores(request->engine)) 5048 cs = gen12_emit_preempt_busywait(request, cs); 5049 5050 request->tail = intel_ring_offset(request, cs); 5051 assert_ring_tail_valid(request->ring, request->tail); 5052 5053 return gen8_emit_wa_tail(request, cs); 5054 } 5055 5056 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 5057 { 5058 /* XXX Stalling flush before seqno write; post-sync not */ 5059 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); 5060 return gen12_emit_fini_breadcrumb_tail(rq, cs); 5061 } 5062 5063 static u32 * 5064 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5065 { 5066 cs = gen12_emit_ggtt_write_rcs(cs, 5067 request->fence.seqno, 5068 hwsp_offset(request), 5069 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5070 PIPE_CONTROL_CS_STALL | 5071 PIPE_CONTROL_TILE_CACHE_FLUSH | 5072 PIPE_CONTROL_FLUSH_L3 | 5073 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5074 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5075 /* Wa_1409600907:tgl */ 5076 PIPE_CONTROL_DEPTH_STALL | 5077 PIPE_CONTROL_DC_FLUSH_ENABLE | 5078 PIPE_CONTROL_FLUSH_ENABLE); 5079 5080 return gen12_emit_fini_breadcrumb_tail(request, cs); 5081 } 5082 5083 static void execlists_park(struct intel_engine_cs *engine) 5084 { 5085 cancel_timer(&engine->execlists.timer); 5086 cancel_timer(&engine->execlists.preempt); 5087 } 5088 
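/*
 * For reference, a rough sketch of the tail that the fini breadcrumbs
 * above append to every request (details differ per gen and engine
 * class; the functions above are authoritative):
 *
 *	seqno write of rq->fence.seqno to hwsp_offset(rq), combined with
 *	    whatever flush the per-gen/rcs variant requires
 *	MI_USER_INTERRUPT
 *	MI_ARB_ON_OFF | MI_ARB_ENABLE
 *	MI_SEMAPHORE_WAIT on the preempt semaphore, if semaphores are on
 *	<- request->tail
 *	MI_ARB_CHECK, MI_NOOP		(the WaIdleLiteRestore wa_tail)
 *	<- request->wa_tail
 */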
5089 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 5090 { 5091 engine->submit_request = execlists_submit_request; 5092 engine->schedule = i915_schedule; 5093 engine->execlists.tasklet.func = execlists_submission_tasklet; 5094 5095 engine->reset.prepare = execlists_reset_prepare; 5096 engine->reset.rewind = execlists_reset_rewind; 5097 engine->reset.cancel = execlists_reset_cancel; 5098 engine->reset.finish = execlists_reset_finish; 5099 5100 engine->park = execlists_park; 5101 engine->unpark = NULL; 5102 5103 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5104 if (!intel_vgpu_active(engine->i915)) { 5105 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5106 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5107 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5108 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5109 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5110 } 5111 } 5112 5113 if (INTEL_GEN(engine->i915) >= 12) 5114 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5115 5116 if (intel_engine_has_preemption(engine)) 5117 engine->emit_bb_start = gen8_emit_bb_start; 5118 else 5119 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5120 } 5121 5122 static void execlists_shutdown(struct intel_engine_cs *engine) 5123 { 5124 /* Synchronise with residual timers and any softirq they raise */ 5125 del_timer_sync(&engine->execlists.timer); 5126 del_timer_sync(&engine->execlists.preempt); 5127 tasklet_kill(&engine->execlists.tasklet); 5128 } 5129 5130 static void execlists_release(struct intel_engine_cs *engine) 5131 { 5132 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5133 5134 execlists_shutdown(engine); 5135 5136 intel_engine_cleanup_common(engine); 5137 lrc_destroy_wa_ctx(engine); 5138 } 5139 5140 static void 5141 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5142 { 5143 /* Default vfuncs which can be overriden by each engine. */ 5144 5145 engine->resume = execlists_resume; 5146 5147 engine->cops = &execlists_context_ops; 5148 engine->request_alloc = execlists_request_alloc; 5149 5150 engine->emit_flush = gen8_emit_flush; 5151 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5152 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5153 if (INTEL_GEN(engine->i915) >= 12) { 5154 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5155 engine->emit_flush = gen12_emit_flush; 5156 } 5157 engine->set_default_submission = intel_execlists_set_default_submission; 5158 5159 if (INTEL_GEN(engine->i915) < 11) { 5160 engine->irq_enable = gen8_logical_ring_enable_irq; 5161 engine->irq_disable = gen8_logical_ring_disable_irq; 5162 } else { 5163 /* 5164 * TODO: On Gen11 interrupt masks need to be clear 5165 * to allow C6 entry. Keep interrupts enabled at 5166 * and take the hit of generating extra interrupts 5167 * until a more refined solution exists. 
5168 */ 5169 } 5170 } 5171 5172 static inline void 5173 logical_ring_default_irqs(struct intel_engine_cs *engine) 5174 { 5175 unsigned int shift = 0; 5176 5177 if (INTEL_GEN(engine->i915) < 11) { 5178 const u8 irq_shifts[] = { 5179 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5180 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5181 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5182 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5183 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5184 }; 5185 5186 shift = irq_shifts[engine->id]; 5187 } 5188 5189 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5190 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5191 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5192 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5193 } 5194 5195 static void rcs_submission_override(struct intel_engine_cs *engine) 5196 { 5197 switch (INTEL_GEN(engine->i915)) { 5198 case 12: 5199 engine->emit_flush = gen12_emit_flush_render; 5200 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5201 break; 5202 case 11: 5203 engine->emit_flush = gen11_emit_flush_render; 5204 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5205 break; 5206 default: 5207 engine->emit_flush = gen8_emit_flush_render; 5208 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5209 break; 5210 } 5211 } 5212 5213 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5214 { 5215 struct intel_engine_execlists * const execlists = &engine->execlists; 5216 struct drm_i915_private *i915 = engine->i915; 5217 struct intel_uncore *uncore = engine->uncore; 5218 u32 base = engine->mmio_base; 5219 5220 tasklet_init(&engine->execlists.tasklet, 5221 execlists_submission_tasklet, (unsigned long)engine); 5222 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5223 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5224 5225 logical_ring_default_vfuncs(engine); 5226 logical_ring_default_irqs(engine); 5227 5228 if (engine->class == RENDER_CLASS) 5229 rcs_submission_override(engine); 5230 5231 if (intel_init_workaround_bb(engine)) 5232 /* 5233 * We continue even if we fail to initialize WA batch 5234 * because we only expect rare glitches but nothing 5235 * critical to prevent us from using GPU 5236 */ 5237 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5238 5239 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5240 execlists->submit_reg = uncore->regs + 5241 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5242 execlists->ctrl_reg = uncore->regs + 5243 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5244 } else { 5245 execlists->submit_reg = uncore->regs + 5246 i915_mmio_reg_offset(RING_ELSP(base)); 5247 } 5248 5249 execlists->csb_status = 5250 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5251 5252 execlists->csb_write = 5253 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5254 5255 if (INTEL_GEN(i915) < 11) 5256 execlists->csb_size = GEN8_CSB_ENTRIES; 5257 else 5258 execlists->csb_size = GEN11_CSB_ENTRIES; 5259 5260 if (INTEL_GEN(engine->i915) >= 11) { 5261 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5262 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5263 } 5264 5265 /* Finally, take ownership and responsibility for cleanup! 
*/ 5266 engine->sanitize = execlists_sanitize; 5267 engine->release = execlists_release; 5268 5269 return 0; 5270 } 5271 5272 static void init_common_reg_state(u32 * const regs, 5273 const struct intel_engine_cs *engine, 5274 const struct intel_ring *ring, 5275 bool inhibit) 5276 { 5277 u32 ctl; 5278 5279 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5280 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5281 if (inhibit) 5282 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5283 if (INTEL_GEN(engine->i915) < 11) 5284 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5285 CTX_CTRL_RS_CTX_ENABLE); 5286 regs[CTX_CONTEXT_CONTROL] = ctl; 5287 5288 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5289 regs[CTX_TIMESTAMP] = 0; 5290 } 5291 5292 static void init_wa_bb_reg_state(u32 * const regs, 5293 const struct intel_engine_cs *engine) 5294 { 5295 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5296 5297 if (wa_ctx->per_ctx.size) { 5298 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5299 5300 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5301 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5302 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5303 } 5304 5305 if (wa_ctx->indirect_ctx.size) { 5306 lrc_ring_setup_indirect_ctx(regs, engine, 5307 i915_ggtt_offset(wa_ctx->vma) + 5308 wa_ctx->indirect_ctx.offset, 5309 wa_ctx->indirect_ctx.size); 5310 } 5311 } 5312 5313 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5314 { 5315 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5316 /* 64b PPGTT (48bit canonical) 5317 * PDP0_DESCRIPTOR contains the base address to PML4 and 5318 * other PDP Descriptors are ignored. 5319 */ 5320 ASSIGN_CTX_PML4(ppgtt, regs); 5321 } else { 5322 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5323 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5324 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5325 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5326 } 5327 } 5328 5329 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5330 { 5331 if (i915_is_ggtt(vm)) 5332 return i915_vm_to_ggtt(vm)->alias; 5333 else 5334 return i915_vm_to_ppgtt(vm); 5335 } 5336 5337 static void execlists_init_reg_state(u32 *regs, 5338 const struct intel_context *ce, 5339 const struct intel_engine_cs *engine, 5340 const struct intel_ring *ring, 5341 bool inhibit) 5342 { 5343 /* 5344 * A context is actually a big batch buffer with several 5345 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5346 * values we are setting here are only for the first context restore: 5347 * on a subsequent save, the GPU will recreate this batchbuffer with new 5348 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5349 * we are not initializing here). 5350 * 5351 * Must keep consistent with virtual_update_register_offsets(). 
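 *
 * As a rough illustration (the authoritative per-gen layout lives in
 * reg_offsets()/set_offsets()): the image is a sequence of
 * MI_LOAD_REGISTER_IMM headers each followed by (mmio reg, value)
 * pairs, and the CTX_* indices used by init_common_reg_state(),
 * init_ppgtt_reg_state() and init_wa_bb_reg_state() below address the
 * value dwords inside those pairs.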
5352 */ 5353 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5354 5355 init_common_reg_state(regs, engine, ring, inhibit); 5356 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5357 5358 init_wa_bb_reg_state(regs, engine); 5359 5360 __reset_stop_ring(regs, engine); 5361 } 5362 5363 static int 5364 populate_lr_context(struct intel_context *ce, 5365 struct drm_i915_gem_object *ctx_obj, 5366 struct intel_engine_cs *engine, 5367 struct intel_ring *ring) 5368 { 5369 bool inhibit = true; 5370 void *vaddr; 5371 5372 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5373 if (IS_ERR(vaddr)) { 5374 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5375 return PTR_ERR(vaddr); 5376 } 5377 5378 set_redzone(vaddr, engine); 5379 5380 if (engine->default_state) { 5381 shmem_read(engine->default_state, 0, 5382 vaddr, engine->context_size); 5383 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5384 inhibit = false; 5385 } 5386 5387 /* Clear the ppHWSP (inc. per-context counters) */ 5388 memset(vaddr, 0, PAGE_SIZE); 5389 5390 /* 5391 * The second page of the context object contains some registers which 5392 * must be set up prior to the first execution. 5393 */ 5394 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5395 ce, engine, ring, inhibit); 5396 5397 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5398 i915_gem_object_unpin_map(ctx_obj); 5399 return 0; 5400 } 5401 5402 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5403 { 5404 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5405 5406 return intel_timeline_create_from_engine(ce->engine, 5407 page_unmask_bits(tl)); 5408 } 5409 5410 static int __execlists_context_alloc(struct intel_context *ce, 5411 struct intel_engine_cs *engine) 5412 { 5413 struct drm_i915_gem_object *ctx_obj; 5414 struct intel_ring *ring; 5415 struct i915_vma *vma; 5416 u32 context_size; 5417 int ret; 5418 5419 GEM_BUG_ON(ce->state); 5420 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5421 5422 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5423 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5424 5425 if (INTEL_GEN(engine->i915) == 12) { 5426 ce->wa_bb_page = context_size / PAGE_SIZE; 5427 context_size += PAGE_SIZE; 5428 } 5429 5430 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5431 if (IS_ERR(ctx_obj)) 5432 return PTR_ERR(ctx_obj); 5433 5434 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5435 if (IS_ERR(vma)) { 5436 ret = PTR_ERR(vma); 5437 goto error_deref_obj; 5438 } 5439 5440 if (!page_mask_bits(ce->timeline)) { 5441 struct intel_timeline *tl; 5442 5443 /* 5444 * Use the static global HWSP for the kernel context, and 5445 * a dynamically allocated cacheline for everyone else. 
5446 */ 5447 if (unlikely(ce->timeline)) 5448 tl = pinned_timeline(ce); 5449 else 5450 tl = intel_timeline_create(engine->gt); 5451 if (IS_ERR(tl)) { 5452 ret = PTR_ERR(tl); 5453 goto error_deref_obj; 5454 } 5455 5456 ce->timeline = tl; 5457 } 5458 5459 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5460 if (IS_ERR(ring)) { 5461 ret = PTR_ERR(ring); 5462 goto error_deref_obj; 5463 } 5464 5465 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5466 if (ret) { 5467 drm_dbg(&engine->i915->drm, 5468 "Failed to populate LRC: %d\n", ret); 5469 goto error_ring_free; 5470 } 5471 5472 ce->ring = ring; 5473 ce->state = vma; 5474 5475 return 0; 5476 5477 error_ring_free: 5478 intel_ring_put(ring); 5479 error_deref_obj: 5480 i915_gem_object_put(ctx_obj); 5481 return ret; 5482 } 5483 5484 static struct list_head *virtual_queue(struct virtual_engine *ve) 5485 { 5486 return &ve->base.execlists.default_priolist.requests[0]; 5487 } 5488 5489 static void rcu_virtual_context_destroy(struct work_struct *wrk) 5490 { 5491 struct virtual_engine *ve = 5492 container_of(wrk, typeof(*ve), rcu.work); 5493 unsigned int n; 5494 5495 GEM_BUG_ON(ve->context.inflight); 5496 5497 /* Preempt-to-busy may leave a stale request behind. */ 5498 if (unlikely(ve->request)) { 5499 struct i915_request *old; 5500 5501 spin_lock_irq(&ve->base.active.lock); 5502 5503 old = fetch_and_zero(&ve->request); 5504 if (old) { 5505 GEM_BUG_ON(!i915_request_completed(old)); 5506 __i915_request_submit(old); 5507 i915_request_put(old); 5508 } 5509 5510 spin_unlock_irq(&ve->base.active.lock); 5511 } 5512 5513 /* 5514 * Flush the tasklet in case it is still running on another core. 5515 * 5516 * This needs to be done before we remove ourselves from the siblings' 5517 * rbtrees as in the case it is running in parallel, it may reinsert 5518 * the rb_node into a sibling. 5519 */ 5520 tasklet_kill(&ve->base.execlists.tasklet); 5521 5522 /* Decouple ourselves from the siblings, no more access allowed. */ 5523 for (n = 0; n < ve->num_siblings; n++) { 5524 struct intel_engine_cs *sibling = ve->siblings[n]; 5525 struct rb_node *node = &ve->nodes[sibling->id].rb; 5526 5527 if (RB_EMPTY_NODE(node)) 5528 continue; 5529 5530 spin_lock_irq(&sibling->active.lock); 5531 5532 /* Detachment is lazily performed in the execlists tasklet */ 5533 if (!RB_EMPTY_NODE(node)) 5534 rb_erase_cached(node, &sibling->execlists.virtual); 5535 5536 spin_unlock_irq(&sibling->active.lock); 5537 } 5538 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5539 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5540 5541 if (ve->context.state) 5542 __execlists_context_fini(&ve->context); 5543 intel_context_fini(&ve->context); 5544 5545 intel_breadcrumbs_free(ve->base.breadcrumbs); 5546 intel_engine_free_request_pool(&ve->base); 5547 5548 kfree(ve->bonds); 5549 kfree(ve); 5550 } 5551 5552 static void virtual_context_destroy(struct kref *kref) 5553 { 5554 struct virtual_engine *ve = 5555 container_of(kref, typeof(*ve), context.ref); 5556 5557 GEM_BUG_ON(!list_empty(&ve->context.signals)); 5558 5559 /* 5560 * When destroying the virtual engine, we have to be aware that 5561 * it may still be in use from an hardirq/softirq context causing 5562 * the resubmission of a completed request (background completion 5563 * due to preempt-to-busy). Before we can free the engine, we need 5564 * to flush the submission code and tasklets that are still potentially 5565 * accessing the engine. 
	 * and since we can guard the resubmit onto the engine with an RCU read
	 * lock, we can delegate the free of the engine to an RCU worker.
	 */
	INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
	queue_rcu_work(system_wq, &ve->rcu);
}

static void virtual_engine_initial_hint(struct virtual_engine *ve)
{
	int swp;

	/*
	 * Pick a random sibling on starting to help spread the load around.
	 *
	 * New contexts are typically created with exactly the same order
	 * of siblings, and often started in batches. Due to the way we iterate
	 * the array of siblings when submitting requests, sibling[0] is
	 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
	 * randomised across the system, we also help spread the load by the
	 * first engine we inspect being different each time.
	 *
	 * NB This does not force us to execute on this engine, it will just
	 * typically be the first we inspect for submission.
	 */
	swp = prandom_u32_max(ve->num_siblings);
	if (swp)
		swap(ve->siblings[swp], ve->siblings[0]);
}

static int virtual_context_alloc(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);

	return __execlists_context_alloc(ce, ve->siblings[0]);
}

static int virtual_context_pin(struct intel_context *ce, void *vaddr)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);

	/* Note: we must use a real engine class for setting up reg state */
	return __execlists_context_pin(ce, ve->siblings[0], vaddr);
}

static void virtual_context_enter(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_get(ve->siblings[n]);

	intel_timeline_enter(ce->timeline);
}

static void virtual_context_exit(struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	unsigned int n;

	intel_timeline_exit(ce->timeline);

	for (n = 0; n < ve->num_siblings; n++)
		intel_engine_pm_put(ve->siblings[n]);
}

static const struct intel_context_ops virtual_context_ops = {
	.alloc = virtual_context_alloc,

	.pre_pin = execlists_context_pre_pin,
	.pin = virtual_context_pin,
	.unpin = execlists_context_unpin,
	.post_unpin = execlists_context_post_unpin,

	.enter = virtual_context_enter,
	.exit = virtual_context_exit,

	.destroy = virtual_context_destroy,
};

static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
{
	struct i915_request *rq;
	intel_engine_mask_t mask;

	rq = READ_ONCE(ve->request);
	if (!rq)
		return 0;

	/* The rq is ready for submission; rq->execution_mask is now stable. */
	mask = rq->execution_mask;
	if (unlikely(!mask)) {
		/* Invalid selection, submit to a random engine in error */
		i915_request_set_error_once(rq, -ENODEV);
		mask = ve->siblings[0]->mask;
	}

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
		     rq->fence.context, rq->fence.seqno,
		     mask, ve->base.execlists.queue_priority_hint);

	return mask;
}

static void virtual_submission_tasklet(unsigned long data)
{
	struct virtual_engine * const ve = (struct virtual_engine *)data;
	const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
	intel_engine_mask_t mask;
	unsigned int n;

	rcu_read_lock();
	mask = virtual_submission_mask(ve);
	rcu_read_unlock();
	if (unlikely(!mask))
		return;

	local_irq_disable();
	for (n = 0; n < ve->num_siblings; n++) {
		struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
		struct ve_node * const node = &ve->nodes[sibling->id];
		struct rb_node **parent, *rb;
		bool first;

		if (!READ_ONCE(ve->request))
			break; /* already handled by a sibling's tasklet */

		if (unlikely(!(mask & sibling->mask))) {
			if (!RB_EMPTY_NODE(&node->rb)) {
				spin_lock(&sibling->active.lock);
				rb_erase_cached(&node->rb,
						&sibling->execlists.virtual);
				RB_CLEAR_NODE(&node->rb);
				spin_unlock(&sibling->active.lock);
			}
			continue;
		}

		spin_lock(&sibling->active.lock);

		if (!RB_EMPTY_NODE(&node->rb)) {
			/*
			 * Cheat and avoid rebalancing the tree if we can
			 * reuse this node in situ.
			 */
			first = rb_first_cached(&sibling->execlists.virtual) ==
				&node->rb;
			if (prio == node->prio || (prio > node->prio && first))
				goto submit_engine;

			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
		}

		rb = NULL;
		first = true;
		parent = &sibling->execlists.virtual.rb_root.rb_node;
		while (*parent) {
			struct ve_node *other;

			rb = *parent;
			other = rb_entry(rb, typeof(*other), rb);
			if (prio > other->prio) {
				parent = &rb->rb_left;
			} else {
				parent = &rb->rb_right;
				first = false;
			}
		}

		rb_link_node(&node->rb, rb, parent);
		rb_insert_color_cached(&node->rb,
				       &sibling->execlists.virtual,
				       first);

submit_engine:
		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
		node->prio = prio;
		if (first && prio > sibling->execlists.queue_priority_hint)
			tasklet_hi_schedule(&sibling->execlists.tasklet);

		spin_unlock(&sibling->active.lock);
	}
	local_irq_enable();
}

static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
		     rq->fence.context,
		     rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
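		/*
		 * Hold a reference while the request is parked on the
		 * virtual engine; it is dropped once a physical sibling
		 * claims and submits the request, or when the request is
		 * replaced or cleaned up on destroy.
		 */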
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}

struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif