/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But, what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use them. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one.
 * This request will then be resubmitted along with a new request for a
 * different context, which will cause the hardware to continue executing the
 * second request and queue the new request (the GPU detects the condition of
 * a context getting preempted with the same context and optimizes the context
 * switch flow by not doing preemption, but just sampling the new tail
 * pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;
	struct rcu_work rcu;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
213 */ 214 struct ve_bond { 215 const struct intel_engine_cs *master; 216 intel_engine_mask_t sibling_mask; 217 } *bonds; 218 unsigned int num_bonds; 219 220 /* And finally, which physical engines this virtual engine maps onto. */ 221 unsigned int num_siblings; 222 struct intel_engine_cs *siblings[]; 223 }; 224 225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 226 { 227 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 228 return container_of(engine, struct virtual_engine, base); 229 } 230 231 static int __execlists_context_alloc(struct intel_context *ce, 232 struct intel_engine_cs *engine); 233 234 static void execlists_init_reg_state(u32 *reg_state, 235 const struct intel_context *ce, 236 const struct intel_engine_cs *engine, 237 const struct intel_ring *ring, 238 bool close); 239 static void 240 __execlists_update_reg_state(const struct intel_context *ce, 241 const struct intel_engine_cs *engine, 242 u32 head); 243 244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 245 { 246 if (INTEL_GEN(engine->i915) >= 12) 247 return 0x60; 248 else if (INTEL_GEN(engine->i915) >= 9) 249 return 0x54; 250 else if (engine->class == RENDER_CLASS) 251 return 0x58; 252 else 253 return -1; 254 } 255 256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 257 { 258 if (INTEL_GEN(engine->i915) >= 12) 259 return 0x74; 260 else if (INTEL_GEN(engine->i915) >= 9) 261 return 0x68; 262 else if (engine->class == RENDER_CLASS) 263 return 0xd8; 264 else 265 return -1; 266 } 267 268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 269 { 270 if (INTEL_GEN(engine->i915) >= 12) 271 return 0x12; 272 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 273 return 0x18; 274 else 275 return -1; 276 } 277 278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 279 { 280 int x; 281 282 x = lrc_ring_wa_bb_per_ctx(engine); 283 if (x < 0) 284 return x; 285 286 return x + 2; 287 } 288 289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 290 { 291 int x; 292 293 x = lrc_ring_indirect_ptr(engine); 294 if (x < 0) 295 return x; 296 297 return x + 2; 298 } 299 300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 301 { 302 if (engine->class != RENDER_CLASS) 303 return -1; 304 305 if (INTEL_GEN(engine->i915) >= 12) 306 return 0xb6; 307 else if (INTEL_GEN(engine->i915) >= 11) 308 return 0xaa; 309 else 310 return -1; 311 } 312 313 static u32 314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 315 { 316 switch (INTEL_GEN(engine->i915)) { 317 default: 318 MISSING_CASE(INTEL_GEN(engine->i915)); 319 fallthrough; 320 case 12: 321 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 322 case 11: 323 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 324 case 10: 325 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 326 case 9: 327 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 328 case 8: 329 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 330 } 331 } 332 333 static void 334 lrc_ring_setup_indirect_ctx(u32 *regs, 335 const struct intel_engine_cs *engine, 336 u32 ctx_bb_ggtt_addr, 337 u32 size) 338 { 339 GEM_BUG_ON(!size); 340 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 341 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 342 regs[lrc_ring_indirect_ptr(engine) + 1] = 343 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 344 345 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 346 regs[lrc_ring_indirect_offset(engine) + 1] = 347 
lrc_ring_indirect_offset_default(engine) << 6; 348 } 349 350 static u32 intel_context_get_runtime(const struct intel_context *ce) 351 { 352 /* 353 * We can use either ppHWSP[16] which is recorded before the context 354 * switch (and so excludes the cost of context switches) or use the 355 * value from the context image itself, which is saved/restored earlier 356 * and so includes the cost of the save. 357 */ 358 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 359 } 360 361 static void mark_eio(struct i915_request *rq) 362 { 363 if (i915_request_completed(rq)) 364 return; 365 366 GEM_BUG_ON(i915_request_signaled(rq)); 367 368 i915_request_set_error_once(rq, -EIO); 369 i915_request_mark_complete(rq); 370 } 371 372 static struct i915_request * 373 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 374 { 375 struct i915_request *active = rq; 376 377 rcu_read_lock(); 378 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 379 if (i915_request_completed(rq)) 380 break; 381 382 active = rq; 383 } 384 rcu_read_unlock(); 385 386 return active; 387 } 388 389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 390 { 391 return (i915_ggtt_offset(engine->status_page.vma) + 392 I915_GEM_HWS_PREEMPT_ADDR); 393 } 394 395 static inline void 396 ring_set_paused(const struct intel_engine_cs *engine, int state) 397 { 398 /* 399 * We inspect HWS_PREEMPT with a semaphore inside 400 * engine->emit_fini_breadcrumb. If the dword is true, 401 * the ring is paused as the semaphore will busywait 402 * until the dword is false. 403 */ 404 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 405 if (state) 406 wmb(); 407 } 408 409 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 410 { 411 return rb_entry(rb, struct i915_priolist, node); 412 } 413 414 static inline int rq_prio(const struct i915_request *rq) 415 { 416 return READ_ONCE(rq->sched.attr.priority); 417 } 418 419 static int effective_prio(const struct i915_request *rq) 420 { 421 int prio = rq_prio(rq); 422 423 /* 424 * If this request is special and must not be interrupted at any 425 * cost, so be it. Note we are only checking the most recent request 426 * in the context and so may be masking an earlier vip request. It 427 * is hoped that under the conditions where nopreempt is used, this 428 * will not matter (i.e. all requests to that context will be 429 * nopreempt for as long as desired). 430 */ 431 if (i915_request_has_nopreempt(rq)) 432 prio = I915_PRIORITY_UNPREEMPTABLE; 433 434 return prio; 435 } 436 437 static int queue_prio(const struct intel_engine_execlists *execlists) 438 { 439 struct i915_priolist *p; 440 struct rb_node *rb; 441 442 rb = rb_first_cached(&execlists->queue); 443 if (!rb) 444 return INT_MIN; 445 446 /* 447 * As the priolist[] are inverted, with the highest priority in [0], 448 * we have to flip the index value to become priority. 449 */ 450 p = to_priolist(rb); 451 if (!I915_USER_PRIORITY_SHIFT) 452 return p->priority; 453 454 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 455 } 456 457 static inline bool need_preempt(const struct intel_engine_cs *engine, 458 const struct i915_request *rq, 459 struct rb_node *rb) 460 { 461 int last_prio; 462 463 if (!intel_engine_has_semaphores(engine)) 464 return false; 465 466 /* 467 * Check if the current priority hint merits a preemption attempt. 
468 * 469 * We record the highest value priority we saw during rescheduling 470 * prior to this dequeue, therefore we know that if it is strictly 471 * less than the current tail of ESLP[0], we do not need to force 472 * a preempt-to-idle cycle. 473 * 474 * However, the priority hint is a mere hint that we may need to 475 * preempt. If that hint is stale or we may be trying to preempt 476 * ourselves, ignore the request. 477 * 478 * More naturally we would write 479 * prio >= max(0, last); 480 * except that we wish to prevent triggering preemption at the same 481 * priority level: the task that is running should remain running 482 * to preserve FIFO ordering of dependencies. 483 */ 484 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 485 if (engine->execlists.queue_priority_hint <= last_prio) 486 return false; 487 488 /* 489 * Check against the first request in ELSP[1], it will, thanks to the 490 * power of PI, be the highest priority of that context. 491 */ 492 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 493 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 494 return true; 495 496 if (rb) { 497 struct virtual_engine *ve = 498 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 499 bool preempt = false; 500 501 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 502 struct i915_request *next; 503 504 rcu_read_lock(); 505 next = READ_ONCE(ve->request); 506 if (next) 507 preempt = rq_prio(next) > last_prio; 508 rcu_read_unlock(); 509 } 510 511 if (preempt) 512 return preempt; 513 } 514 515 /* 516 * If the inflight context did not trigger the preemption, then maybe 517 * it was the set of queued requests? Pick the highest priority in 518 * the queue (the first active priolist) and see if it deserves to be 519 * running instead of ELSP[0]. 520 * 521 * The highest priority request in the queue can not be either 522 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 523 * context, it's priority would not exceed ELSP[0] aka last_prio. 524 */ 525 return queue_prio(&engine->execlists) > last_prio; 526 } 527 528 __maybe_unused static inline bool 529 assert_priority_queue(const struct i915_request *prev, 530 const struct i915_request *next) 531 { 532 /* 533 * Without preemption, the prev may refer to the still active element 534 * which we refuse to let go. 535 * 536 * Even with preemption, there are times when we think it is better not 537 * to preempt and leave an ostensibly lower priority request in flight. 538 */ 539 if (i915_request_is_active(prev)) 540 return true; 541 542 return rq_prio(prev) >= rq_prio(next); 543 } 544 545 /* 546 * The context descriptor encodes various attributes of a context, 547 * including its GTT address and some flags. Because it's fairly 548 * expensive to calculate, we'll just do it once and cache the result, 549 * which remains valid until the context is unpinned. 
550 * 551 * This is what a descriptor looks like, from LSB to MSB:: 552 * 553 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 554 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 555 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 556 * bits 53-54: mbz, reserved for use by hardware 557 * bits 55-63: group ID, currently unused and set to 0 558 * 559 * Starting from Gen11, the upper dword of the descriptor has a new format: 560 * 561 * bits 32-36: reserved 562 * bits 37-47: SW context ID 563 * bits 48:53: engine instance 564 * bit 54: mbz, reserved for use by hardware 565 * bits 55-60: SW counter 566 * bits 61-63: engine class 567 * 568 * engine info, SW context ID and SW counter need to form a unique number 569 * (Context ID) per lrc. 570 */ 571 static u32 572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 573 { 574 u32 desc; 575 576 desc = INTEL_LEGACY_32B_CONTEXT; 577 if (i915_vm_is_4lvl(ce->vm)) 578 desc = INTEL_LEGACY_64B_CONTEXT; 579 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 580 581 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 582 if (IS_GEN(engine->i915, 8)) 583 desc |= GEN8_CTX_L3LLC_COHERENT; 584 585 return i915_ggtt_offset(ce->state) | desc; 586 } 587 588 static inline unsigned int dword_in_page(void *addr) 589 { 590 return offset_in_page(addr) / sizeof(u32); 591 } 592 593 static void set_offsets(u32 *regs, 594 const u8 *data, 595 const struct intel_engine_cs *engine, 596 bool clear) 597 #define NOP(x) (BIT(7) | (x)) 598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 599 #define POSTED BIT(0) 600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 601 #define REG16(x) \ 602 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 603 (((x) >> 2) & 0x7f) 604 #define END(total_state_size) 0, (total_state_size) 605 { 606 const u32 base = engine->mmio_base; 607 608 while (*data) { 609 u8 count, flags; 610 611 if (*data & BIT(7)) { /* skip */ 612 count = *data++ & ~BIT(7); 613 if (clear) 614 memset32(regs, MI_NOOP, count); 615 regs += count; 616 continue; 617 } 618 619 count = *data & 0x3f; 620 flags = *data >> 6; 621 data++; 622 623 *regs = MI_LOAD_REGISTER_IMM(count); 624 if (flags & POSTED) 625 *regs |= MI_LRI_FORCE_POSTED; 626 if (INTEL_GEN(engine->i915) >= 11) 627 *regs |= MI_LRI_LRM_CS_MMIO; 628 regs++; 629 630 GEM_BUG_ON(!count); 631 do { 632 u32 offset = 0; 633 u8 v; 634 635 do { 636 v = *data++; 637 offset <<= 7; 638 offset |= v & ~BIT(7); 639 } while (v & BIT(7)); 640 641 regs[0] = base + (offset << 2); 642 if (clear) 643 regs[1] = 0; 644 regs += 2; 645 } while (--count); 646 } 647 648 if (clear) { 649 u8 count = *++data; 650 651 /* Clear past the tail for HW access */ 652 GEM_BUG_ON(dword_in_page(regs) > count); 653 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 654 655 /* Close the batch; used mainly by live_lrc_layout() */ 656 *regs = MI_BATCH_BUFFER_END; 657 if (INTEL_GEN(engine->i915) >= 10) 658 *regs |= BIT(0); 659 } 660 } 661 662 static const u8 gen8_xcs_offsets[] = { 663 NOP(1), 664 LRI(11, 0), 665 REG16(0x244), 666 REG(0x034), 667 REG(0x030), 668 REG(0x038), 669 REG(0x03c), 670 REG(0x168), 671 REG(0x140), 672 REG(0x110), 673 REG(0x11c), 674 REG(0x114), 675 REG(0x118), 676 677 NOP(9), 678 LRI(9, 0), 679 REG16(0x3a8), 680 REG16(0x28c), 681 REG16(0x288), 682 REG16(0x284), 683 REG16(0x280), 684 REG16(0x27c), 685 REG16(0x278), 686 REG16(0x274), 687 REG16(0x270), 688 689 NOP(13), 690 LRI(2, 0), 691 REG16(0x200), 692 REG(0x028), 693 694 END(80) 
695 }; 696 697 static const u8 gen9_xcs_offsets[] = { 698 NOP(1), 699 LRI(14, POSTED), 700 REG16(0x244), 701 REG(0x034), 702 REG(0x030), 703 REG(0x038), 704 REG(0x03c), 705 REG(0x168), 706 REG(0x140), 707 REG(0x110), 708 REG(0x11c), 709 REG(0x114), 710 REG(0x118), 711 REG(0x1c0), 712 REG(0x1c4), 713 REG(0x1c8), 714 715 NOP(3), 716 LRI(9, POSTED), 717 REG16(0x3a8), 718 REG16(0x28c), 719 REG16(0x288), 720 REG16(0x284), 721 REG16(0x280), 722 REG16(0x27c), 723 REG16(0x278), 724 REG16(0x274), 725 REG16(0x270), 726 727 NOP(13), 728 LRI(1, POSTED), 729 REG16(0x200), 730 731 NOP(13), 732 LRI(44, POSTED), 733 REG(0x028), 734 REG(0x09c), 735 REG(0x0c0), 736 REG(0x178), 737 REG(0x17c), 738 REG16(0x358), 739 REG(0x170), 740 REG(0x150), 741 REG(0x154), 742 REG(0x158), 743 REG16(0x41c), 744 REG16(0x600), 745 REG16(0x604), 746 REG16(0x608), 747 REG16(0x60c), 748 REG16(0x610), 749 REG16(0x614), 750 REG16(0x618), 751 REG16(0x61c), 752 REG16(0x620), 753 REG16(0x624), 754 REG16(0x628), 755 REG16(0x62c), 756 REG16(0x630), 757 REG16(0x634), 758 REG16(0x638), 759 REG16(0x63c), 760 REG16(0x640), 761 REG16(0x644), 762 REG16(0x648), 763 REG16(0x64c), 764 REG16(0x650), 765 REG16(0x654), 766 REG16(0x658), 767 REG16(0x65c), 768 REG16(0x660), 769 REG16(0x664), 770 REG16(0x668), 771 REG16(0x66c), 772 REG16(0x670), 773 REG16(0x674), 774 REG16(0x678), 775 REG16(0x67c), 776 REG(0x068), 777 778 END(176) 779 }; 780 781 static const u8 gen12_xcs_offsets[] = { 782 NOP(1), 783 LRI(13, POSTED), 784 REG16(0x244), 785 REG(0x034), 786 REG(0x030), 787 REG(0x038), 788 REG(0x03c), 789 REG(0x168), 790 REG(0x140), 791 REG(0x110), 792 REG(0x1c0), 793 REG(0x1c4), 794 REG(0x1c8), 795 REG(0x180), 796 REG16(0x2b4), 797 798 NOP(5), 799 LRI(9, POSTED), 800 REG16(0x3a8), 801 REG16(0x28c), 802 REG16(0x288), 803 REG16(0x284), 804 REG16(0x280), 805 REG16(0x27c), 806 REG16(0x278), 807 REG16(0x274), 808 REG16(0x270), 809 810 END(80) 811 }; 812 813 static const u8 gen8_rcs_offsets[] = { 814 NOP(1), 815 LRI(14, POSTED), 816 REG16(0x244), 817 REG(0x034), 818 REG(0x030), 819 REG(0x038), 820 REG(0x03c), 821 REG(0x168), 822 REG(0x140), 823 REG(0x110), 824 REG(0x11c), 825 REG(0x114), 826 REG(0x118), 827 REG(0x1c0), 828 REG(0x1c4), 829 REG(0x1c8), 830 831 NOP(3), 832 LRI(9, POSTED), 833 REG16(0x3a8), 834 REG16(0x28c), 835 REG16(0x288), 836 REG16(0x284), 837 REG16(0x280), 838 REG16(0x27c), 839 REG16(0x278), 840 REG16(0x274), 841 REG16(0x270), 842 843 NOP(13), 844 LRI(1, 0), 845 REG(0x0c8), 846 847 END(80) 848 }; 849 850 static const u8 gen9_rcs_offsets[] = { 851 NOP(1), 852 LRI(14, POSTED), 853 REG16(0x244), 854 REG(0x34), 855 REG(0x30), 856 REG(0x38), 857 REG(0x3c), 858 REG(0x168), 859 REG(0x140), 860 REG(0x110), 861 REG(0x11c), 862 REG(0x114), 863 REG(0x118), 864 REG(0x1c0), 865 REG(0x1c4), 866 REG(0x1c8), 867 868 NOP(3), 869 LRI(9, POSTED), 870 REG16(0x3a8), 871 REG16(0x28c), 872 REG16(0x288), 873 REG16(0x284), 874 REG16(0x280), 875 REG16(0x27c), 876 REG16(0x278), 877 REG16(0x274), 878 REG16(0x270), 879 880 NOP(13), 881 LRI(1, 0), 882 REG(0xc8), 883 884 NOP(13), 885 LRI(44, POSTED), 886 REG(0x28), 887 REG(0x9c), 888 REG(0xc0), 889 REG(0x178), 890 REG(0x17c), 891 REG16(0x358), 892 REG(0x170), 893 REG(0x150), 894 REG(0x154), 895 REG(0x158), 896 REG16(0x41c), 897 REG16(0x600), 898 REG16(0x604), 899 REG16(0x608), 900 REG16(0x60c), 901 REG16(0x610), 902 REG16(0x614), 903 REG16(0x618), 904 REG16(0x61c), 905 REG16(0x620), 906 REG16(0x624), 907 REG16(0x628), 908 REG16(0x62c), 909 REG16(0x630), 910 REG16(0x634), 911 REG16(0x638), 912 REG16(0x63c), 913 
REG16(0x640), 914 REG16(0x644), 915 REG16(0x648), 916 REG16(0x64c), 917 REG16(0x650), 918 REG16(0x654), 919 REG16(0x658), 920 REG16(0x65c), 921 REG16(0x660), 922 REG16(0x664), 923 REG16(0x668), 924 REG16(0x66c), 925 REG16(0x670), 926 REG16(0x674), 927 REG16(0x678), 928 REG16(0x67c), 929 REG(0x68), 930 931 END(176) 932 }; 933 934 static const u8 gen11_rcs_offsets[] = { 935 NOP(1), 936 LRI(15, POSTED), 937 REG16(0x244), 938 REG(0x034), 939 REG(0x030), 940 REG(0x038), 941 REG(0x03c), 942 REG(0x168), 943 REG(0x140), 944 REG(0x110), 945 REG(0x11c), 946 REG(0x114), 947 REG(0x118), 948 REG(0x1c0), 949 REG(0x1c4), 950 REG(0x1c8), 951 REG(0x180), 952 953 NOP(1), 954 LRI(9, POSTED), 955 REG16(0x3a8), 956 REG16(0x28c), 957 REG16(0x288), 958 REG16(0x284), 959 REG16(0x280), 960 REG16(0x27c), 961 REG16(0x278), 962 REG16(0x274), 963 REG16(0x270), 964 965 LRI(1, POSTED), 966 REG(0x1b0), 967 968 NOP(10), 969 LRI(1, 0), 970 REG(0x0c8), 971 972 END(80) 973 }; 974 975 static const u8 gen12_rcs_offsets[] = { 976 NOP(1), 977 LRI(13, POSTED), 978 REG16(0x244), 979 REG(0x034), 980 REG(0x030), 981 REG(0x038), 982 REG(0x03c), 983 REG(0x168), 984 REG(0x140), 985 REG(0x110), 986 REG(0x1c0), 987 REG(0x1c4), 988 REG(0x1c8), 989 REG(0x180), 990 REG16(0x2b4), 991 992 NOP(5), 993 LRI(9, POSTED), 994 REG16(0x3a8), 995 REG16(0x28c), 996 REG16(0x288), 997 REG16(0x284), 998 REG16(0x280), 999 REG16(0x27c), 1000 REG16(0x278), 1001 REG16(0x274), 1002 REG16(0x270), 1003 1004 LRI(3, POSTED), 1005 REG(0x1b0), 1006 REG16(0x5a8), 1007 REG16(0x5ac), 1008 1009 NOP(6), 1010 LRI(1, 0), 1011 REG(0x0c8), 1012 NOP(3 + 9 + 1), 1013 1014 LRI(51, POSTED), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG16(0x588), 1021 REG(0x028), 1022 REG(0x09c), 1023 REG(0x0c0), 1024 REG(0x178), 1025 REG(0x17c), 1026 REG16(0x358), 1027 REG(0x170), 1028 REG(0x150), 1029 REG(0x154), 1030 REG(0x158), 1031 REG16(0x41c), 1032 REG16(0x600), 1033 REG16(0x604), 1034 REG16(0x608), 1035 REG16(0x60c), 1036 REG16(0x610), 1037 REG16(0x614), 1038 REG16(0x618), 1039 REG16(0x61c), 1040 REG16(0x620), 1041 REG16(0x624), 1042 REG16(0x628), 1043 REG16(0x62c), 1044 REG16(0x630), 1045 REG16(0x634), 1046 REG16(0x638), 1047 REG16(0x63c), 1048 REG16(0x640), 1049 REG16(0x644), 1050 REG16(0x648), 1051 REG16(0x64c), 1052 REG16(0x650), 1053 REG16(0x654), 1054 REG16(0x658), 1055 REG16(0x65c), 1056 REG16(0x660), 1057 REG16(0x664), 1058 REG16(0x668), 1059 REG16(0x66c), 1060 REG16(0x670), 1061 REG16(0x674), 1062 REG16(0x678), 1063 REG16(0x67c), 1064 REG(0x068), 1065 REG(0x084), 1066 NOP(1), 1067 1068 END(192) 1069 }; 1070 1071 #undef END 1072 #undef REG16 1073 #undef REG 1074 #undef LRI 1075 #undef NOP 1076 1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1078 { 1079 /* 1080 * The gen12+ lists only have the registers we program in the basic 1081 * default state. We rely on the context image using relative 1082 * addressing to automatic fixup the register state between the 1083 * physical engines for virtual engine. 
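 *
 * As an illustrative aside (not part of the original comment): the
 * gen*_offsets[] tables above can be read as a small bytecode that
 * set_offsets() interprets. Under that reading, the start of
 * gen8_xcs_offsets decodes roughly as:
 *
 *	NOP(1)		- leave one dword as MI_NOOP
 *	LRI(11, 0)	- emit MI_LOAD_REGISTER_IMM(11)
 *	REG16(0x244)	- register at engine->mmio_base + 0x244
 *	REG(0x034)	- register at engine->mmio_base + 0x034
 *	...
 *	END(80)		- when clearing, pad the state with MI_NOOP up to
 *			  dword 80 and close with MI_BATCH_BUFFER_END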
1084 */ 1085 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1086 !intel_engine_has_relative_mmio(engine)); 1087 1088 if (engine->class == RENDER_CLASS) { 1089 if (INTEL_GEN(engine->i915) >= 12) 1090 return gen12_rcs_offsets; 1091 else if (INTEL_GEN(engine->i915) >= 11) 1092 return gen11_rcs_offsets; 1093 else if (INTEL_GEN(engine->i915) >= 9) 1094 return gen9_rcs_offsets; 1095 else 1096 return gen8_rcs_offsets; 1097 } else { 1098 if (INTEL_GEN(engine->i915) >= 12) 1099 return gen12_xcs_offsets; 1100 else if (INTEL_GEN(engine->i915) >= 9) 1101 return gen9_xcs_offsets; 1102 else 1103 return gen8_xcs_offsets; 1104 } 1105 } 1106 1107 static struct i915_request * 1108 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1109 { 1110 struct i915_request *rq, *rn, *active = NULL; 1111 struct list_head *pl; 1112 int prio = I915_PRIORITY_INVALID; 1113 1114 lockdep_assert_held(&engine->active.lock); 1115 1116 list_for_each_entry_safe_reverse(rq, rn, 1117 &engine->active.requests, 1118 sched.link) { 1119 if (i915_request_completed(rq)) { 1120 list_del_init(&rq->sched.link); 1121 continue; 1122 } 1123 1124 __i915_request_unsubmit(rq); 1125 1126 /* 1127 * Push the request back into the queue for later resubmission. 1128 * If this request is not native to this physical engine (i.e. 1129 * it came from a virtual source), push it back onto the virtual 1130 * engine so that it can be moved across onto another physical 1131 * engine as load dictates. 1132 */ 1133 if (likely(rq->execution_mask == engine->mask)) { 1134 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1135 if (rq_prio(rq) != prio) { 1136 prio = rq_prio(rq); 1137 pl = i915_sched_lookup_priolist(engine, prio); 1138 } 1139 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1140 1141 list_move(&rq->sched.link, pl); 1142 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1143 1144 /* Check in case we rollback so far we wrap [size/2] */ 1145 if (intel_ring_direction(rq->ring, 1146 rq->tail, 1147 rq->ring->tail + 8) > 0) 1148 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1149 1150 active = rq; 1151 } else { 1152 struct intel_engine_cs *owner = rq->context->engine; 1153 1154 WRITE_ONCE(rq->engine, owner); 1155 owner->submit_request(rq); 1156 active = NULL; 1157 } 1158 } 1159 1160 return active; 1161 } 1162 1163 struct i915_request * 1164 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1165 { 1166 struct intel_engine_cs *engine = 1167 container_of(execlists, typeof(*engine), execlists); 1168 1169 return __unwind_incomplete_requests(engine); 1170 } 1171 1172 static inline void 1173 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1174 { 1175 /* 1176 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1177 * The compiler should eliminate this function as dead-code. 
1178 */ 1179 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1180 return; 1181 1182 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1183 status, rq); 1184 } 1185 1186 static void intel_engine_context_in(struct intel_engine_cs *engine) 1187 { 1188 unsigned long flags; 1189 1190 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1191 return; 1192 1193 write_seqlock_irqsave(&engine->stats.lock, flags); 1194 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1195 engine->stats.start = ktime_get(); 1196 atomic_inc(&engine->stats.active); 1197 } 1198 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1199 } 1200 1201 static void intel_engine_context_out(struct intel_engine_cs *engine) 1202 { 1203 unsigned long flags; 1204 1205 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1206 1207 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1208 return; 1209 1210 write_seqlock_irqsave(&engine->stats.lock, flags); 1211 if (atomic_dec_and_test(&engine->stats.active)) { 1212 engine->stats.total = 1213 ktime_add(engine->stats.total, 1214 ktime_sub(ktime_get(), engine->stats.start)); 1215 } 1216 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1217 } 1218 1219 static void 1220 execlists_check_context(const struct intel_context *ce, 1221 const struct intel_engine_cs *engine, 1222 const char *when) 1223 { 1224 const struct intel_ring *ring = ce->ring; 1225 u32 *regs = ce->lrc_reg_state; 1226 bool valid = true; 1227 int x; 1228 1229 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1230 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1231 engine->name, 1232 regs[CTX_RING_START], 1233 i915_ggtt_offset(ring->vma)); 1234 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1235 valid = false; 1236 } 1237 1238 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1239 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1240 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1241 engine->name, 1242 regs[CTX_RING_CTL], 1243 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1244 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1245 valid = false; 1246 } 1247 1248 x = lrc_ring_mi_mode(engine); 1249 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1250 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1251 engine->name, regs[x + 1]); 1252 regs[x + 1] &= ~STOP_RING; 1253 regs[x + 1] |= STOP_RING << 16; 1254 valid = false; 1255 } 1256 1257 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when); 1258 } 1259 1260 static void restore_default_state(struct intel_context *ce, 1261 struct intel_engine_cs *engine) 1262 { 1263 u32 *regs; 1264 1265 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1266 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1267 1268 ce->runtime.last = intel_context_get_runtime(ce); 1269 } 1270 1271 static void reset_active(struct i915_request *rq, 1272 struct intel_engine_cs *engine) 1273 { 1274 struct intel_context * const ce = rq->context; 1275 u32 head; 1276 1277 /* 1278 * The executing context has been cancelled. We want to prevent 1279 * further execution along this context and propagate the error on 1280 * to anything depending on its results. 1281 * 1282 * In __i915_request_submit(), we apply the -EIO and remove the 1283 * requests' payloads for any banned requests. 
But first, we must 1284 * rewind the context back to the start of the incomplete request so 1285 * that we do not jump back into the middle of the batch. 1286 * 1287 * We preserve the breadcrumbs and semaphores of the incomplete 1288 * requests so that inter-timeline dependencies (i.e other timelines) 1289 * remain correctly ordered. And we defer to __i915_request_submit() 1290 * so that all asynchronous waits are correctly handled. 1291 */ 1292 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1293 rq->fence.context, rq->fence.seqno); 1294 1295 /* On resubmission of the active request, payload will be scrubbed */ 1296 if (i915_request_completed(rq)) 1297 head = rq->tail; 1298 else 1299 head = active_request(ce->timeline, rq)->head; 1300 head = intel_ring_wrap(ce->ring, head); 1301 1302 /* Scrub the context image to prevent replaying the previous batch */ 1303 restore_default_state(ce, engine); 1304 __execlists_update_reg_state(ce, engine, head); 1305 1306 /* We've switched away, so this should be a no-op, but intent matters */ 1307 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1308 } 1309 1310 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1311 { 1312 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1313 ce->runtime.num_underflow++; 1314 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1315 #endif 1316 } 1317 1318 static void intel_context_update_runtime(struct intel_context *ce) 1319 { 1320 u32 old; 1321 s32 dt; 1322 1323 if (intel_context_is_barrier(ce)) 1324 return; 1325 1326 old = ce->runtime.last; 1327 ce->runtime.last = intel_context_get_runtime(ce); 1328 dt = ce->runtime.last - old; 1329 1330 if (unlikely(dt < 0)) { 1331 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1332 old, ce->runtime.last, dt); 1333 st_update_runtime_underflow(ce, dt); 1334 return; 1335 } 1336 1337 ewma_runtime_add(&ce->runtime.avg, dt); 1338 ce->runtime.total += dt; 1339 } 1340 1341 static inline struct intel_engine_cs * 1342 __execlists_schedule_in(struct i915_request *rq) 1343 { 1344 struct intel_engine_cs * const engine = rq->engine; 1345 struct intel_context * const ce = rq->context; 1346 1347 intel_context_get(ce); 1348 1349 if (unlikely(intel_context_is_banned(ce))) 1350 reset_active(rq, engine); 1351 1352 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1353 execlists_check_context(ce, engine, "before"); 1354 1355 if (ce->tag) { 1356 /* Use a fixed tag for OA and friends */ 1357 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1358 ce->lrc.ccid = ce->tag; 1359 } else { 1360 /* We don't need a strict matching tag, just different values */ 1361 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1362 1363 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1364 clear_bit(tag - 1, &engine->context_tag); 1365 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1366 1367 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1368 } 1369 1370 ce->lrc.ccid |= engine->execlists.ccid; 1371 1372 __intel_gt_pm_get(engine->gt); 1373 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1374 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1375 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1376 intel_engine_context_in(engine); 1377 1378 return engine; 1379 } 1380 1381 static inline struct i915_request * 1382 execlists_schedule_in(struct i915_request *rq, int idx) 1383 { 1384 struct intel_context * const ce = rq->context; 1385 struct intel_engine_cs *old; 1386 1387 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1388 trace_i915_request_in(rq, 
idx); 1389 1390 old = READ_ONCE(ce->inflight); 1391 do { 1392 if (!old) { 1393 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1394 break; 1395 } 1396 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1397 1398 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1399 return i915_request_get(rq); 1400 } 1401 1402 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1403 { 1404 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1405 struct i915_request *next = READ_ONCE(ve->request); 1406 1407 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1408 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1409 } 1410 1411 static inline void 1412 __execlists_schedule_out(struct i915_request *rq, 1413 struct intel_engine_cs * const engine, 1414 unsigned int ccid) 1415 { 1416 struct intel_context * const ce = rq->context; 1417 1418 /* 1419 * NB process_csb() is not under the engine->active.lock and hence 1420 * schedule_out can race with schedule_in meaning that we should 1421 * refrain from doing non-trivial work here. 1422 */ 1423 1424 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1425 execlists_check_context(ce, engine, "after"); 1426 1427 /* 1428 * If we have just completed this context, the engine may now be 1429 * idle and we want to re-enter powersaving. 1430 */ 1431 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1432 i915_request_completed(rq)) 1433 intel_engine_add_retire(engine, ce->timeline); 1434 1435 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1436 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1437 if (ccid < BITS_PER_LONG) { 1438 GEM_BUG_ON(ccid == 0); 1439 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1440 set_bit(ccid - 1, &engine->context_tag); 1441 } 1442 1443 intel_context_update_runtime(ce); 1444 intel_engine_context_out(engine); 1445 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1446 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1447 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1448 intel_gt_pm_put_async(engine->gt); 1449 1450 /* 1451 * If this is part of a virtual engine, its next request may 1452 * have been blocked waiting for access to the active context. 1453 * We have to kick all the siblings again in case we need to 1454 * switch (e.g. the next request is not runnable on this 1455 * engine). Hopefully, we will already have submitted the next 1456 * request before the tasklet runs and do not need to rebuild 1457 * each virtual tree and kick everyone again. 1458 */ 1459 if (ce->engine != engine) 1460 kick_siblings(rq, ce); 1461 1462 intel_context_put(ce); 1463 } 1464 1465 static inline void 1466 execlists_schedule_out(struct i915_request *rq) 1467 { 1468 struct intel_context * const ce = rq->context; 1469 struct intel_engine_cs *cur, *old; 1470 u32 ccid; 1471 1472 trace_i915_request_out(rq); 1473 1474 ccid = rq->context->lrc.ccid; 1475 old = READ_ONCE(ce->inflight); 1476 do 1477 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1478 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1479 if (!cur) 1480 __execlists_schedule_out(rq, old, ccid); 1481 1482 i915_request_put(rq); 1483 } 1484 1485 static u64 execlists_update_context(struct i915_request *rq) 1486 { 1487 struct intel_context *ce = rq->context; 1488 u64 desc = ce->lrc.desc; 1489 u32 tail, prev; 1490 1491 /* 1492 * WaIdleLiteRestore:bdw,skl 1493 * 1494 * We should never submit the context with the same RING_TAIL twice 1495 * just in case we submit an empty ring, which confuses the HW. 
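	 * (Added note, not part of the original comment: a "lite restore" is
	 * the HW behaviour described in the DOC section at the top of this
	 * file, where being asked to switch to the context it is already
	 * running makes the GPU skip the full save/restore and merely sample
	 * the new RING_TAIL; resubmitting an unchanged RING_TAIL therefore
	 * presents the HW with an empty ring.)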
1496 * 1497 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1498 * the normal request to be able to always advance the RING_TAIL on 1499 * subsequent resubmissions (for lite restore). Should that fail us, 1500 * and we try and submit the same tail again, force the context 1501 * reload. 1502 * 1503 * If we need to return to a preempted context, we need to skip the 1504 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1505 * HW has a tendency to ignore us rewinding the TAIL to the end of 1506 * an earlier request. 1507 */ 1508 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1509 prev = rq->ring->tail; 1510 tail = intel_ring_set_tail(rq->ring, rq->tail); 1511 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1512 desc |= CTX_DESC_FORCE_RESTORE; 1513 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1514 rq->tail = rq->wa_tail; 1515 1516 /* 1517 * Make sure the context image is complete before we submit it to HW. 1518 * 1519 * Ostensibly, writes (including the WCB) should be flushed prior to 1520 * an uncached write such as our mmio register access, the empirical 1521 * evidence (esp. on Braswell) suggests that the WC write into memory 1522 * may not be visible to the HW prior to the completion of the UC 1523 * register write and that we may begin execution from the context 1524 * before its image is complete leading to invalid PD chasing. 1525 */ 1526 wmb(); 1527 1528 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1529 return desc; 1530 } 1531 1532 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1533 { 1534 if (execlists->ctrl_reg) { 1535 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1536 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1537 } else { 1538 writel(upper_32_bits(desc), execlists->submit_reg); 1539 writel(lower_32_bits(desc), execlists->submit_reg); 1540 } 1541 } 1542 1543 static __maybe_unused char * 1544 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1545 { 1546 if (!rq) 1547 return ""; 1548 1549 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1550 prefix, 1551 rq->context->lrc.ccid, 1552 rq->fence.context, rq->fence.seqno, 1553 i915_request_completed(rq) ? "!" : 1554 i915_request_started(rq) ? 
"*" : 1555 "", 1556 rq_prio(rq)); 1557 1558 return buf; 1559 } 1560 1561 static __maybe_unused void 1562 trace_ports(const struct intel_engine_execlists *execlists, 1563 const char *msg, 1564 struct i915_request * const *ports) 1565 { 1566 const struct intel_engine_cs *engine = 1567 container_of(execlists, typeof(*engine), execlists); 1568 char __maybe_unused p0[40], p1[40]; 1569 1570 if (!ports[0]) 1571 return; 1572 1573 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1574 dump_port(p0, sizeof(p0), "", ports[0]), 1575 dump_port(p1, sizeof(p1), ", ", ports[1])); 1576 } 1577 1578 static inline bool 1579 reset_in_progress(const struct intel_engine_execlists *execlists) 1580 { 1581 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1582 } 1583 1584 static __maybe_unused bool 1585 assert_pending_valid(const struct intel_engine_execlists *execlists, 1586 const char *msg) 1587 { 1588 struct intel_engine_cs *engine = 1589 container_of(execlists, typeof(*engine), execlists); 1590 struct i915_request * const *port, *rq; 1591 struct intel_context *ce = NULL; 1592 bool sentinel = false; 1593 u32 ccid = -1; 1594 1595 trace_ports(execlists, msg, execlists->pending); 1596 1597 /* We may be messing around with the lists during reset, lalala */ 1598 if (reset_in_progress(execlists)) 1599 return true; 1600 1601 if (!execlists->pending[0]) { 1602 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1603 engine->name); 1604 return false; 1605 } 1606 1607 if (execlists->pending[execlists_num_ports(execlists)]) { 1608 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1609 engine->name, execlists_num_ports(execlists)); 1610 return false; 1611 } 1612 1613 for (port = execlists->pending; (rq = *port); port++) { 1614 unsigned long flags; 1615 bool ok = true; 1616 1617 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1618 GEM_BUG_ON(!i915_request_is_active(rq)); 1619 1620 if (ce == rq->context) { 1621 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1622 engine->name, 1623 ce->timeline->fence_context, 1624 port - execlists->pending); 1625 return false; 1626 } 1627 ce = rq->context; 1628 1629 if (ccid == ce->lrc.ccid) { 1630 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1631 engine->name, 1632 ccid, ce->timeline->fence_context, 1633 port - execlists->pending); 1634 return false; 1635 } 1636 ccid = ce->lrc.ccid; 1637 1638 /* 1639 * Sentinels are supposed to be the last request so they flush 1640 * the current execution off the HW. Check that they are the only 1641 * request in the pending submission. 1642 */ 1643 if (sentinel) { 1644 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1645 engine->name, 1646 ce->timeline->fence_context, 1647 port - execlists->pending); 1648 return false; 1649 } 1650 sentinel = i915_request_has_sentinel(rq); 1651 1652 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1653 if (!spin_trylock_irqsave(&rq->lock, flags)) 1654 continue; 1655 1656 if (i915_request_completed(rq)) 1657 goto unlock; 1658 1659 if (i915_active_is_idle(&ce->active) && 1660 !intel_context_is_barrier(ce)) { 1661 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1662 engine->name, 1663 ce->timeline->fence_context, 1664 port - execlists->pending); 1665 ok = false; 1666 goto unlock; 1667 } 1668 1669 if (!i915_vma_is_pinned(ce->state)) { 1670 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1671 engine->name, 1672 ce->timeline->fence_context, 1673 port - execlists->pending); 1674 ok = false; 1675 goto unlock; 1676 } 1677 1678 if (!i915_vma_is_pinned(ce->ring->vma)) { 1679 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1680 engine->name, 1681 ce->timeline->fence_context, 1682 port - execlists->pending); 1683 ok = false; 1684 goto unlock; 1685 } 1686 1687 unlock: 1688 spin_unlock_irqrestore(&rq->lock, flags); 1689 if (!ok) 1690 return false; 1691 } 1692 1693 return ce; 1694 } 1695 1696 static void execlists_submit_ports(struct intel_engine_cs *engine) 1697 { 1698 struct intel_engine_execlists *execlists = &engine->execlists; 1699 unsigned int n; 1700 1701 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1702 1703 /* 1704 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1705 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1706 * not be relinquished until the device is idle (see 1707 * i915_gem_idle_work_handler()). As a precaution, we make sure 1708 * that all ELSP are drained i.e. we have processed the CSB, 1709 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1710 */ 1711 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1712 1713 /* 1714 * ELSQ note: the submit queue is not cleared after being submitted 1715 * to the HW so we need to make sure we always clean it up. This is 1716 * currently ensured by the fact that we always write the same number 1717 * of elsq entries, keep this in mind before changing the loop below. 1718 */ 1719 for (n = execlists_num_ports(execlists); n--; ) { 1720 struct i915_request *rq = execlists->pending[n]; 1721 1722 write_desc(execlists, 1723 rq ? execlists_update_context(rq) : 0, 1724 n); 1725 } 1726 1727 /* we need to manually load the submit queue */ 1728 if (execlists->ctrl_reg) 1729 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1730 } 1731 1732 static bool ctx_single_port_submission(const struct intel_context *ce) 1733 { 1734 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1735 intel_context_force_single_submission(ce)); 1736 } 1737 1738 static bool can_merge_ctx(const struct intel_context *prev, 1739 const struct intel_context *next) 1740 { 1741 if (prev != next) 1742 return false; 1743 1744 if (ctx_single_port_submission(prev)) 1745 return false; 1746 1747 return true; 1748 } 1749 1750 static unsigned long i915_request_flags(const struct i915_request *rq) 1751 { 1752 return READ_ONCE(rq->fence.flags); 1753 } 1754 1755 static bool can_merge_rq(const struct i915_request *prev, 1756 const struct i915_request *next) 1757 { 1758 GEM_BUG_ON(prev == next); 1759 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1760 1761 /* 1762 * We do not submit known completed requests. Therefore if the next 1763 * request is already completed, we can pretend to merge it in 1764 * with the previous context (and we will skip updating the ELSP 1765 * and tracking). 
Thus hopefully keeping the ELSP full with active 1766 * contexts, despite the best efforts of preempt-to-busy to confuse 1767 * us. 1768 */ 1769 if (i915_request_completed(next)) 1770 return true; 1771 1772 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) & 1773 (BIT(I915_FENCE_FLAG_NOPREEMPT) | 1774 BIT(I915_FENCE_FLAG_SENTINEL)))) 1775 return false; 1776 1777 if (!can_merge_ctx(prev->context, next->context)) 1778 return false; 1779 1780 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno)); 1781 return true; 1782 } 1783 1784 static void virtual_update_register_offsets(u32 *regs, 1785 struct intel_engine_cs *engine) 1786 { 1787 set_offsets(regs, reg_offsets(engine), engine, false); 1788 } 1789 1790 static bool virtual_matches(const struct virtual_engine *ve, 1791 const struct i915_request *rq, 1792 const struct intel_engine_cs *engine) 1793 { 1794 const struct intel_engine_cs *inflight; 1795 1796 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1797 return false; 1798 1799 /* 1800 * We track when the HW has completed saving the context image 1801 * (i.e. when we have seen the final CS event switching out of 1802 * the context) and must not overwrite the context image before 1803 * then. This restricts us to only using the active engine 1804 * while the previous virtualized request is inflight (so 1805 * we reuse the register offsets). This is a very small 1806 * hystersis on the greedy seelction algorithm. 1807 */ 1808 inflight = intel_context_inflight(&ve->context); 1809 if (inflight && inflight != engine) 1810 return false; 1811 1812 return true; 1813 } 1814 1815 static void virtual_xfer_context(struct virtual_engine *ve, 1816 struct intel_engine_cs *engine) 1817 { 1818 unsigned int n; 1819 1820 if (likely(engine == ve->siblings[0])) 1821 return; 1822 1823 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1824 if (!intel_engine_has_relative_mmio(engine)) 1825 virtual_update_register_offsets(ve->context.lrc_reg_state, 1826 engine); 1827 1828 /* 1829 * Move the bound engine to the top of the list for 1830 * future execution. We then kick this tasklet first 1831 * before checking others, so that we preferentially 1832 * reuse this set of bound registers. 1833 */ 1834 for (n = 1; n < ve->num_siblings; n++) { 1835 if (ve->siblings[n] == engine) { 1836 swap(ve->siblings[n], ve->siblings[0]); 1837 break; 1838 } 1839 } 1840 } 1841 1842 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1843 { 1844 LIST_HEAD(list); 1845 1846 /* 1847 * We want to move the interrupted request to the back of 1848 * the round-robin list (i.e. its priority level), but 1849 * in doing so, we must then move all requests that were in 1850 * flight and were waiting for the interrupted request to 1851 * be run after it again. 
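	 *
	 * (Illustrative example, not part of the original comment: if request
	 * A is unwound from the HW while B and C, queued at the same priority
	 * behind it and depending on it, are still ready, the priority list
	 * goes from [A, B, C, ...] to [..., A, B, C], keeping B and C ordered
	 * after A.)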
1852 */ 1853 do { 1854 struct i915_dependency *p; 1855 1856 GEM_BUG_ON(i915_request_is_active(rq)); 1857 list_move_tail(&rq->sched.link, pl); 1858 1859 for_each_waiter(p, rq) { 1860 struct i915_request *w = 1861 container_of(p->waiter, typeof(*w), sched); 1862 1863 if (p->flags & I915_DEPENDENCY_WEAK) 1864 continue; 1865 1866 /* Leave semaphores spinning on the other engines */ 1867 if (w->engine != rq->engine) 1868 continue; 1869 1870 /* No waiter should start before its signaler */ 1871 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1872 i915_request_started(w) && 1873 !i915_request_completed(rq)); 1874 1875 GEM_BUG_ON(i915_request_is_active(w)); 1876 if (!i915_request_is_ready(w)) 1877 continue; 1878 1879 if (rq_prio(w) < rq_prio(rq)) 1880 continue; 1881 1882 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1883 list_move_tail(&w->sched.link, &list); 1884 } 1885 1886 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1887 } while (rq); 1888 } 1889 1890 static void defer_active(struct intel_engine_cs *engine) 1891 { 1892 struct i915_request *rq; 1893 1894 rq = __unwind_incomplete_requests(engine); 1895 if (!rq) 1896 return; 1897 1898 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1899 } 1900 1901 static bool 1902 need_timeslice(const struct intel_engine_cs *engine, 1903 const struct i915_request *rq, 1904 const struct rb_node *rb) 1905 { 1906 int hint; 1907 1908 if (!intel_engine_has_timeslices(engine)) 1909 return false; 1910 1911 hint = engine->execlists.queue_priority_hint; 1912 1913 if (rb) { 1914 const struct virtual_engine *ve = 1915 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1916 const struct intel_engine_cs *inflight = 1917 intel_context_inflight(&ve->context); 1918 1919 if (!inflight || inflight == engine) { 1920 struct i915_request *next; 1921 1922 rcu_read_lock(); 1923 next = READ_ONCE(ve->request); 1924 if (next) 1925 hint = max(hint, rq_prio(next)); 1926 rcu_read_unlock(); 1927 } 1928 } 1929 1930 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1931 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1932 1933 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1934 return hint >= effective_prio(rq); 1935 } 1936 1937 static bool 1938 timeslice_yield(const struct intel_engine_execlists *el, 1939 const struct i915_request *rq) 1940 { 1941 /* 1942 * Once bitten, forever smitten! 1943 * 1944 * If the active context ever busy-waited on a semaphore, 1945 * it will be treated as a hog until the end of its timeslice (i.e. 1946 * until it is scheduled out and replaced by a new submission, 1947 * possibly even its own lite-restore). The HW only sends an interrupt 1948 * on the first miss, and we do know if that semaphore has been 1949 * signaled, or even if it is now stuck on another semaphore. Play 1950 * safe, yield if it might be stuck -- it will be given a fresh 1951 * timeslice in the near future. 
1952 */ 1953 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1954 } 1955 1956 static bool 1957 timeslice_expired(const struct intel_engine_execlists *el, 1958 const struct i915_request *rq) 1959 { 1960 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1961 } 1962 1963 static int 1964 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1965 { 1966 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1967 return engine->execlists.queue_priority_hint; 1968 1969 return rq_prio(list_next_entry(rq, sched.link)); 1970 } 1971 1972 static inline unsigned long 1973 timeslice(const struct intel_engine_cs *engine) 1974 { 1975 return READ_ONCE(engine->props.timeslice_duration_ms); 1976 } 1977 1978 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1979 { 1980 const struct intel_engine_execlists *execlists = &engine->execlists; 1981 const struct i915_request *rq = *execlists->active; 1982 1983 if (!rq || i915_request_completed(rq)) 1984 return 0; 1985 1986 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1987 return 0; 1988 1989 return timeslice(engine); 1990 } 1991 1992 static void set_timeslice(struct intel_engine_cs *engine) 1993 { 1994 unsigned long duration; 1995 1996 if (!intel_engine_has_timeslices(engine)) 1997 return; 1998 1999 duration = active_timeslice(engine); 2000 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2001 2002 set_timer_ms(&engine->execlists.timer, duration); 2003 } 2004 2005 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2006 { 2007 struct intel_engine_execlists *execlists = &engine->execlists; 2008 unsigned long duration; 2009 2010 if (!intel_engine_has_timeslices(engine)) 2011 return; 2012 2013 WRITE_ONCE(execlists->switch_priority_hint, prio); 2014 if (prio == INT_MIN) 2015 return; 2016 2017 if (timer_pending(&execlists->timer)) 2018 return; 2019 2020 duration = timeslice(engine); 2021 ENGINE_TRACE(engine, 2022 "start timeslicing, prio:%d, interval:%lu", 2023 prio, duration); 2024 2025 set_timer_ms(&execlists->timer, duration); 2026 } 2027 2028 static void record_preemption(struct intel_engine_execlists *execlists) 2029 { 2030 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2031 } 2032 2033 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2034 const struct i915_request *rq) 2035 { 2036 if (!rq) 2037 return 0; 2038 2039 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2040 if (unlikely(intel_context_is_banned(rq->context))) 2041 return 1; 2042 2043 return READ_ONCE(engine->props.preempt_timeout_ms); 2044 } 2045 2046 static void set_preempt_timeout(struct intel_engine_cs *engine, 2047 const struct i915_request *rq) 2048 { 2049 if (!intel_engine_has_preempt_reset(engine)) 2050 return; 2051 2052 set_timer_ms(&engine->execlists.preempt, 2053 active_preempt_timeout(engine, rq)); 2054 } 2055 2056 static inline void clear_ports(struct i915_request **ports, int count) 2057 { 2058 memset_p((void **)ports, NULL, count); 2059 } 2060 2061 static inline void 2062 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2063 { 2064 /* A memcpy_p() would be very useful here! 
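 * (Copying pointer by pointer with WRITE_ONCE() keeps each store
 * atomic, so a concurrent reader walking the inflight[] array via
 * execlists_active() never sees a torn pointer.)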
*/ 2065 while (count--) 2066 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2067 } 2068 2069 static void execlists_dequeue(struct intel_engine_cs *engine) 2070 { 2071 struct intel_engine_execlists * const execlists = &engine->execlists; 2072 struct i915_request **port = execlists->pending; 2073 struct i915_request ** const last_port = port + execlists->port_mask; 2074 struct i915_request * const *active; 2075 struct i915_request *last; 2076 struct rb_node *rb; 2077 bool submit = false; 2078 2079 /* 2080 * Hardware submission is through 2 ports. Conceptually each port 2081 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2082 * static for a context, and unique to each, so we only execute 2083 * requests belonging to a single context from each ring. RING_HEAD 2084 * is maintained by the CS in the context image, it marks the place 2085 * where it got up to last time, and through RING_TAIL we tell the CS 2086 * where we want to execute up to this time. 2087 * 2088 * In this list the requests are in order of execution. Consecutive 2089 * requests from the same context are adjacent in the ringbuffer. We 2090 * can combine these requests into a single RING_TAIL update: 2091 * 2092 * RING_HEAD...req1...req2 2093 * ^- RING_TAIL 2094 * since to execute req2 the CS must first execute req1. 2095 * 2096 * Our goal then is to point each port to the end of a consecutive 2097 * sequence of requests as being the most optimal (fewest wake ups 2098 * and context switches) submission. 2099 */ 2100 2101 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2102 struct virtual_engine *ve = 2103 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2104 struct i915_request *rq = READ_ONCE(ve->request); 2105 2106 if (!rq) { /* lazily cleanup after another engine handled rq */ 2107 rb_erase_cached(rb, &execlists->virtual); 2108 RB_CLEAR_NODE(rb); 2109 rb = rb_first_cached(&execlists->virtual); 2110 continue; 2111 } 2112 2113 if (!virtual_matches(ve, rq, engine)) { 2114 rb = rb_next(rb); 2115 continue; 2116 } 2117 2118 break; 2119 } 2120 2121 /* 2122 * If the queue is higher priority than the last 2123 * request in the currently active context, submit afresh. 2124 * We will resubmit again afterwards in case we need to split 2125 * the active context to interject the preemption request, 2126 * i.e. we will retrigger preemption following the ack in case 2127 * of trouble. 2128 */ 2129 active = READ_ONCE(execlists->active); 2130 2131 /* 2132 * In theory we can skip over completed contexts that have not 2133 * yet been processed by events (as those events are in flight): 2134 * 2135 * while ((last = *active) && i915_request_completed(last)) 2136 * active++; 2137 * 2138 * However, the GPU cannot handle this as it will ultimately 2139 * find itself trying to jump back into a context it has just 2140 * completed and barf. 2141 */ 2142 2143 if ((last = *active)) { 2144 if (i915_request_completed(last)) { 2145 goto check_secondary; 2146 } else if (need_preempt(engine, last, rb)) { 2147 ENGINE_TRACE(engine, 2148 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2149 last->fence.context, 2150 last->fence.seqno, 2151 last->sched.attr.priority, 2152 execlists->queue_priority_hint); 2153 record_preemption(execlists); 2154 2155 /* 2156 * Don't let the RING_HEAD advance past the breadcrumb 2157 * as we unwind (and until we resubmit) so that we do 2158 * not accidentally tell it to go backwards. 
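 * (Roughly: ring_set_paused(engine, 1) arms the preempt-to-busy
 * semaphore that every request busywaits on at its final breadcrumb,
 * and it is released again with ring_set_paused(engine, 0) once the
 * new submission, or the skip, has been processed.)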
2159 */ 2160 ring_set_paused(engine, 1); 2161 2162 /* 2163 * Note that we have not stopped the GPU at this point, 2164 * so we are unwinding the incomplete requests as they 2165 * remain inflight and so by the time we do complete 2166 * the preemption, some of the unwound requests may 2167 * complete! 2168 */ 2169 __unwind_incomplete_requests(engine); 2170 2171 last = NULL; 2172 } else if (need_timeslice(engine, last, rb) && 2173 timeslice_expired(execlists, last)) { 2174 ENGINE_TRACE(engine, 2175 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2176 last->fence.context, 2177 last->fence.seqno, 2178 last->sched.attr.priority, 2179 execlists->queue_priority_hint, 2180 yesno(timeslice_yield(execlists, last))); 2181 2182 ring_set_paused(engine, 1); 2183 defer_active(engine); 2184 2185 /* 2186 * Unlike for preemption, if we rewind and continue 2187 * executing the same context as previously active, 2188 * the order of execution will remain the same and 2189 * the tail will only advance. We do not need to 2190 * force a full context restore, as a lite-restore 2191 * is sufficient to resample the monotonic TAIL. 2192 * 2193 * If we switch to any other context, similarly we 2194 * will not rewind TAIL of current context, and 2195 * normal save/restore will preserve state and allow 2196 * us to later continue executing the same request. 2197 */ 2198 last = NULL; 2199 } else { 2200 /* 2201 * Otherwise if we already have a request pending 2202 * for execution after the current one, we can 2203 * just wait until the next CS event before 2204 * queuing more. In either case we will force a 2205 * lite-restore preemption event, but if we wait 2206 * we hopefully coalesce several updates into a single 2207 * submission. 2208 */ 2209 check_secondary: 2210 if (!list_is_last(&last->sched.link, 2211 &engine->active.requests)) { 2212 /* 2213 * Even if ELSP[1] is occupied and not worthy 2214 * of timeslices, our queue might be. 2215 */ 2216 start_timeslice(engine, queue_prio(execlists)); 2217 return; 2218 } 2219 } 2220 } 2221 2222 while (rb) { /* XXX virtual is always taking precedence */ 2223 struct virtual_engine *ve = 2224 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2225 struct i915_request *rq; 2226 2227 spin_lock(&ve->base.active.lock); 2228 2229 rq = ve->request; 2230 if (unlikely(!rq)) { /* lost the race to a sibling */ 2231 spin_unlock(&ve->base.active.lock); 2232 rb_erase_cached(rb, &execlists->virtual); 2233 RB_CLEAR_NODE(rb); 2234 rb = rb_first_cached(&execlists->virtual); 2235 continue; 2236 } 2237 2238 GEM_BUG_ON(rq != ve->request); 2239 GEM_BUG_ON(rq->engine != &ve->base); 2240 GEM_BUG_ON(rq->context != &ve->context); 2241 2242 if (rq_prio(rq) >= queue_prio(execlists)) { 2243 if (!virtual_matches(ve, rq, engine)) { 2244 spin_unlock(&ve->base.active.lock); 2245 rb = rb_next(rb); 2246 continue; 2247 } 2248 2249 if (last && !can_merge_rq(last, rq)) { 2250 spin_unlock(&ve->base.active.lock); 2251 start_timeslice(engine, rq_prio(rq)); 2252 return; /* leave this for another sibling */ 2253 } 2254 2255 ENGINE_TRACE(engine, 2256 "virtual rq=%llx:%lld%s, new engine? %s\n", 2257 rq->fence.context, 2258 rq->fence.seqno, 2259 i915_request_completed(rq) ? "!" : 2260 i915_request_started(rq) ? 
"*" : 2261 "", 2262 yesno(engine != ve->siblings[0])); 2263 2264 WRITE_ONCE(ve->request, NULL); 2265 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2266 INT_MIN); 2267 rb_erase_cached(rb, &execlists->virtual); 2268 RB_CLEAR_NODE(rb); 2269 2270 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2271 WRITE_ONCE(rq->engine, engine); 2272 2273 if (__i915_request_submit(rq)) { 2274 /* 2275 * Only after we confirm that we will submit 2276 * this request (i.e. it has not already 2277 * completed), do we want to update the context. 2278 * 2279 * This serves two purposes. It avoids 2280 * unnecessary work if we are resubmitting an 2281 * already completed request after timeslicing. 2282 * But more importantly, it prevents us altering 2283 * ve->siblings[] on an idle context, where 2284 * we may be using ve->siblings[] in 2285 * virtual_context_enter / virtual_context_exit. 2286 */ 2287 virtual_xfer_context(ve, engine); 2288 GEM_BUG_ON(ve->siblings[0] != engine); 2289 2290 submit = true; 2291 last = rq; 2292 } 2293 i915_request_put(rq); 2294 2295 /* 2296 * Hmm, we have a bunch of virtual engine requests, 2297 * but the first one was already completed (thanks 2298 * preempt-to-busy!). Keep looking at the veng queue 2299 * until we have no more relevant requests (i.e. 2300 * the normal submit queue has higher priority). 2301 */ 2302 if (!submit) { 2303 spin_unlock(&ve->base.active.lock); 2304 rb = rb_first_cached(&execlists->virtual); 2305 continue; 2306 } 2307 } 2308 2309 spin_unlock(&ve->base.active.lock); 2310 break; 2311 } 2312 2313 while ((rb = rb_first_cached(&execlists->queue))) { 2314 struct i915_priolist *p = to_priolist(rb); 2315 struct i915_request *rq, *rn; 2316 int i; 2317 2318 priolist_for_each_request_consume(rq, rn, p, i) { 2319 bool merge = true; 2320 2321 /* 2322 * Can we combine this request with the current port? 2323 * It has to be the same context/ringbuffer and not 2324 * have any exceptions (e.g. GVT saying never to 2325 * combine contexts). 2326 * 2327 * If we can combine the requests, we can execute both 2328 * by updating the RING_TAIL to point to the end of the 2329 * second request, and so we never need to tell the 2330 * hardware about the first. 2331 */ 2332 if (last && !can_merge_rq(last, rq)) { 2333 /* 2334 * If we are on the second port and cannot 2335 * combine this request with the last, then we 2336 * are done. 2337 */ 2338 if (port == last_port) 2339 goto done; 2340 2341 /* 2342 * We must not populate both ELSP[] with the 2343 * same LRCA, i.e. we must submit 2 different 2344 * contexts if we submit 2 ELSP. 2345 */ 2346 if (last->context == rq->context) 2347 goto done; 2348 2349 if (i915_request_has_sentinel(last)) 2350 goto done; 2351 2352 /* 2353 * If GVT overrides us we only ever submit 2354 * port[0], leaving port[1] empty. Note that we 2355 * also have to be careful that we don't queue 2356 * the same context (even though a different 2357 * request) to the second port. 
2358 */ 2359 if (ctx_single_port_submission(last->context) || 2360 ctx_single_port_submission(rq->context)) 2361 goto done; 2362 2363 merge = false; 2364 } 2365 2366 if (__i915_request_submit(rq)) { 2367 if (!merge) { 2368 *port = execlists_schedule_in(last, port - execlists->pending); 2369 port++; 2370 last = NULL; 2371 } 2372 2373 GEM_BUG_ON(last && 2374 !can_merge_ctx(last->context, 2375 rq->context)); 2376 GEM_BUG_ON(last && 2377 i915_seqno_passed(last->fence.seqno, 2378 rq->fence.seqno)); 2379 2380 submit = true; 2381 last = rq; 2382 } 2383 } 2384 2385 rb_erase_cached(&p->node, &execlists->queue); 2386 i915_priolist_free(p); 2387 } 2388 2389 done: 2390 /* 2391 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2392 * 2393 * We choose the priority hint such that if we add a request of greater 2394 * priority than this, we kick the submission tasklet to decide on 2395 * the right order of submitting the requests to hardware. We must 2396 * also be prepared to reorder requests as they are in-flight on the 2397 * HW. We derive the priority hint then as the first "hole" in 2398 * the HW submission ports and if there are no available slots, 2399 * the priority of the lowest executing request, i.e. last. 2400 * 2401 * When we do receive a higher priority request ready to run from the 2402 * user, see queue_request(), the priority hint is bumped to that 2403 * request triggering preemption on the next dequeue (or subsequent 2404 * interrupt for secondary ports). 2405 */ 2406 execlists->queue_priority_hint = queue_prio(execlists); 2407 2408 if (submit) { 2409 *port = execlists_schedule_in(last, port - execlists->pending); 2410 execlists->switch_priority_hint = 2411 switch_prio(engine, *execlists->pending); 2412 2413 /* 2414 * Skip if we ended up with exactly the same set of requests, 2415 * e.g. 
trying to timeslice a pair of ordered contexts 2416 */ 2417 if (!memcmp(active, execlists->pending, 2418 (port - execlists->pending + 1) * sizeof(*port))) { 2419 do 2420 execlists_schedule_out(fetch_and_zero(port)); 2421 while (port-- != execlists->pending); 2422 2423 goto skip_submit; 2424 } 2425 clear_ports(port + 1, last_port - port); 2426 2427 WRITE_ONCE(execlists->yield, -1); 2428 set_preempt_timeout(engine, *active); 2429 execlists_submit_ports(engine); 2430 } else { 2431 start_timeslice(engine, execlists->queue_priority_hint); 2432 skip_submit: 2433 ring_set_paused(engine, 0); 2434 } 2435 } 2436 2437 static void 2438 cancel_port_requests(struct intel_engine_execlists * const execlists) 2439 { 2440 struct i915_request * const *port; 2441 2442 for (port = execlists->pending; *port; port++) 2443 execlists_schedule_out(*port); 2444 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2445 2446 /* Mark the end of active before we overwrite *active */ 2447 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2448 execlists_schedule_out(*port); 2449 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2450 2451 smp_wmb(); /* complete the seqlock for execlists_active() */ 2452 WRITE_ONCE(execlists->active, execlists->inflight); 2453 } 2454 2455 static inline void 2456 invalidate_csb_entries(const u64 *first, const u64 *last) 2457 { 2458 clflush((void *)first); 2459 clflush((void *)last); 2460 } 2461 2462 /* 2463 * Starting with Gen12, the status has a new format: 2464 * 2465 * bit 0: switched to new queue 2466 * bit 1: reserved 2467 * bit 2: semaphore wait mode (poll or signal), only valid when 2468 * switch detail is set to "wait on semaphore" 2469 * bits 3-5: engine class 2470 * bits 6-11: engine instance 2471 * bits 12-14: reserved 2472 * bits 15-25: sw context id of the lrc the GT switched to 2473 * bits 26-31: sw counter of the lrc the GT switched to 2474 * bits 32-35: context switch detail 2475 * - 0: ctx complete 2476 * - 1: wait on sync flip 2477 * - 2: wait on vblank 2478 * - 3: wait on scanline 2479 * - 4: wait on semaphore 2480 * - 5: context preempted (not on SEMAPHORE_WAIT or 2481 * WAIT_FOR_EVENT) 2482 * bit 36: reserved 2483 * bits 37-43: wait detail (for switch detail 1 to 4) 2484 * bits 44-46: reserved 2485 * bits 47-57: sw context id of the lrc the GT switched away from 2486 * bits 58-63: sw counter of the lrc the GT switched away from 2487 */ 2488 static inline bool gen12_csb_parse(const u64 csb) 2489 { 2490 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb)); 2491 bool new_queue = 2492 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2493 2494 /* 2495 * The context switch detail is not guaranteed to be 5 when a preemption 2496 * occurs, so we can't just check for that. The check below works for 2497 * all the cases we care about, including preemptions of WAIT 2498 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2499 * would require some extra handling, but we don't support that. 2500 */ 2501 if (!ctx_away_valid || new_queue) { 2502 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb))); 2503 return true; 2504 } 2505 2506 /* 2507 * switch detail = 5 is covered by the case above and we do not expect a 2508 * context switch on an unsuccessful wait instruction since we always 2509 * use polling mode. 
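 *
 * In short: returning true means the GT has switched onto the newly
 * submitted queue (promote pending[] to active[]); returning false
 * means the context in port0 of the current active[] set completed.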
2510 */ 2511 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb))); 2512 return false; 2513 } 2514 2515 static inline bool gen8_csb_parse(const u64 csb) 2516 { 2517 return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2518 } 2519 2520 static noinline u64 2521 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2522 { 2523 u64 entry; 2524 2525 /* 2526 * Reading from the HWSP has one particular advantage: we can detect 2527 * a stale entry. Since the write into HWSP is broken, we have no reason 2528 * to trust the HW at all, the mmio entry may equally be unordered, so 2529 * we prefer the path that is self-checking and as a last resort, 2530 * return the mmio value. 2531 * 2532 * tgl,dg1:HSDES#22011327657 2533 */ 2534 preempt_disable(); 2535 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) { 2536 int idx = csb - engine->execlists.csb_status; 2537 int status; 2538 2539 status = GEN8_EXECLISTS_STATUS_BUF; 2540 if (idx >= 6) { 2541 status = GEN11_EXECLISTS_STATUS_BUF2; 2542 idx -= 6; 2543 } 2544 status += sizeof(u64) * idx; 2545 2546 entry = intel_uncore_read64(engine->uncore, 2547 _MMIO(engine->mmio_base + status)); 2548 } 2549 preempt_enable(); 2550 2551 return entry; 2552 } 2553 2554 static inline u64 2555 csb_read(const struct intel_engine_cs *engine, u64 * const csb) 2556 { 2557 u64 entry = READ_ONCE(*csb); 2558 2559 /* 2560 * Unfortunately, the GPU does not always serialise its write 2561 * of the CSB entries before its write of the CSB pointer, at least 2562 * from the perspective of the CPU, using what is known as a Global 2563 * Observation Point. We may read a new CSB tail pointer, but then 2564 * read the stale CSB entries, causing us to misinterpret the 2565 * context-switch events, and eventually declare the GPU hung. 2566 * 2567 * icl:HSDES#1806554093 2568 * tgl:HSDES#22011248461 2569 */ 2570 if (unlikely(entry == -1)) 2571 entry = wa_csb_read(engine, csb); 2572 2573 /* Consume this entry so that we can spot its future reuse. */ 2574 WRITE_ONCE(*csb, -1); 2575 2576 /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */ 2577 return entry; 2578 } 2579 2580 static void process_csb(struct intel_engine_cs *engine) 2581 { 2582 struct intel_engine_execlists * const execlists = &engine->execlists; 2583 u64 * const buf = execlists->csb_status; 2584 const u8 num_entries = execlists->csb_size; 2585 u8 head, tail; 2586 2587 /* 2588 * As we modify our execlists state tracking we require exclusive 2589 * access. Either we are inside the tasklet, or the tasklet is disabled 2590 * and we assume that is only inside the reset paths and so serialised. 2591 */ 2592 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2593 !reset_in_progress(execlists)); 2594 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2595 2596 /* 2597 * Note that csb_write, csb_status may be either in HWSP or mmio. 2598 * When reading from the csb_write mmio register, we have to be 2599 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2600 * the low 4bits. As it happens we know the next 4bits are always 2601 * zero and so we can simply mask off the low u8 of the register 2602 * and treat it identically to reading from the HWSP (without having 2603 * to use explicit shifting and masking, and probably bifurcating 2604 * the code to handle the legacy mmio read).
2605 */ 2606 head = execlists->csb_head; 2607 tail = READ_ONCE(*execlists->csb_write); 2608 if (unlikely(head == tail)) 2609 return; 2610 2611 /* 2612 * We will consume all events from HW, or at least pretend to. 2613 * 2614 * The sequence of events from the HW is deterministic, and derived 2615 * from our writes to the ELSP, with a smidgen of variability for 2616 * the arrival of the asynchronous requests wrt the inflight 2617 * execution. If the HW sends an event that does not correspond with 2618 * the one we are expecting, we have to abandon all hope as we lose 2619 * all tracking of what the engine is actually executing. We will 2620 * only detect we are out of sequence with the HW when we get an 2621 * 'impossible' event because we have already drained our own 2622 * preemption/promotion queue. If this occurs, we know that we likely 2623 * lost track of execution earlier and must unwind and restart; the 2624 * simplest way is to stop processing the event queue and force the 2625 * engine to reset. 2626 */ 2627 execlists->csb_head = tail; 2628 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2629 2630 /* 2631 * Hopefully paired with a wmb() in HW! 2632 * 2633 * We must complete the read of the write pointer before any reads 2634 * from the CSB, so that we do not see stale values. Without an rmb 2635 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2636 * we perform the READ_ONCE(*csb_write). 2637 */ 2638 rmb(); 2639 do { 2640 bool promote; 2641 u64 csb; 2642 2643 if (++head == num_entries) 2644 head = 0; 2645 2646 /* 2647 * We are flying near dragons again. 2648 * 2649 * We hold a reference to the request in execlist_port[] 2650 * but no more than that. We are operating in softirq 2651 * context and so cannot hold any mutex or sleep. That 2652 * prevents us stopping the requests we are processing 2653 * in port[] from being retired simultaneously (the 2654 * breadcrumb will be complete before we see the 2655 * context-switch). As we only hold the reference to the 2656 * request, any pointer chasing underneath the request 2657 * is subject to a potential use-after-free. Thus we 2658 * store all of the bookkeeping within port[] as 2659 * required, and avoid using unguarded pointers beneath 2660 * request itself. The same applies to the atomic 2661 * status notifier.
2662 */ 2663 2664 csb = csb_read(engine, buf + head); 2665 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2666 head, upper_32_bits(csb), lower_32_bits(csb)); 2667 2668 if (INTEL_GEN(engine->i915) >= 12) 2669 promote = gen12_csb_parse(csb); 2670 else 2671 promote = gen8_csb_parse(csb); 2672 if (promote) { 2673 struct i915_request * const *old = execlists->active; 2674 2675 if (GEM_WARN_ON(!*execlists->pending)) { 2676 execlists->error_interrupt |= ERROR_CSB; 2677 break; 2678 } 2679 2680 ring_set_paused(engine, 0); 2681 2682 /* Point active to the new ELSP; prevent overwriting */ 2683 WRITE_ONCE(execlists->active, execlists->pending); 2684 smp_wmb(); /* notify execlists_active() */ 2685 2686 /* cancel old inflight, prepare for switch */ 2687 trace_ports(execlists, "preempted", old); 2688 while (*old) 2689 execlists_schedule_out(*old++); 2690 2691 /* switch pending to inflight */ 2692 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2693 copy_ports(execlists->inflight, 2694 execlists->pending, 2695 execlists_num_ports(execlists)); 2696 smp_wmb(); /* complete the seqlock */ 2697 WRITE_ONCE(execlists->active, execlists->inflight); 2698 2699 /* XXX Magic delay for tgl */ 2700 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2701 2702 WRITE_ONCE(execlists->pending[0], NULL); 2703 } else { 2704 if (GEM_WARN_ON(!*execlists->active)) { 2705 execlists->error_interrupt |= ERROR_CSB; 2706 break; 2707 } 2708 2709 /* port0 completed, advanced to port1 */ 2710 trace_ports(execlists, "completed", execlists->active); 2711 2712 /* 2713 * We rely on the hardware being strongly 2714 * ordered, that the breadcrumb write is 2715 * coherent (visible from the CPU) before the 2716 * user interrupt is processed. One might assume 2717 * that the breadcrumb write being before the 2718 * user interrupt and the CS event for the context 2719 * switch would therefore be before the CS event 2720 * itself... 2721 */ 2722 if (GEM_SHOW_DEBUG() && 2723 !i915_request_completed(*execlists->active)) { 2724 struct i915_request *rq = *execlists->active; 2725 const u32 *regs __maybe_unused = 2726 rq->context->lrc_reg_state; 2727 2728 ENGINE_TRACE(engine, 2729 "context completed before request!\n"); 2730 ENGINE_TRACE(engine, 2731 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2732 ENGINE_READ(engine, RING_START), 2733 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2734 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2735 ENGINE_READ(engine, RING_CTL), 2736 ENGINE_READ(engine, RING_MI_MODE)); 2737 ENGINE_TRACE(engine, 2738 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2739 i915_ggtt_offset(rq->ring->vma), 2740 rq->head, rq->tail, 2741 rq->fence.context, 2742 lower_32_bits(rq->fence.seqno), 2743 hwsp_seqno(rq)); 2744 ENGINE_TRACE(engine, 2745 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2746 regs[CTX_RING_START], 2747 regs[CTX_RING_HEAD], 2748 regs[CTX_RING_TAIL]); 2749 } 2750 2751 execlists_schedule_out(*execlists->active++); 2752 2753 GEM_BUG_ON(execlists->active - execlists->inflight > 2754 execlists_num_ports(execlists)); 2755 } 2756 } while (head != tail); 2757 2758 set_timeslice(engine); 2759 2760 /* 2761 * Gen11 has proven to fail wrt global observation point between 2762 * entry and tail update, failing on the ordering and thus 2763 * we see an old entry in the context status buffer. 2764 * 2765 * Forcibly evict the entries for the next gpu csb update, 2766 * to increase the odds that we get fresh entries with non 2767 * working hardware.
The cost for doing so comes out mostly in 2768 * the wash as hardware, working or not, will need to do the 2769 * invalidation before. 2770 */ 2771 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2772 } 2773 2774 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2775 { 2776 lockdep_assert_held(&engine->active.lock); 2777 if (!READ_ONCE(engine->execlists.pending[0])) { 2778 rcu_read_lock(); /* protect peeking at execlists->active */ 2779 execlists_dequeue(engine); 2780 rcu_read_unlock(); 2781 } 2782 } 2783 2784 static void __execlists_hold(struct i915_request *rq) 2785 { 2786 LIST_HEAD(list); 2787 2788 do { 2789 struct i915_dependency *p; 2790 2791 if (i915_request_is_active(rq)) 2792 __i915_request_unsubmit(rq); 2793 2794 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2795 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2796 i915_request_set_hold(rq); 2797 RQ_TRACE(rq, "on hold\n"); 2798 2799 for_each_waiter(p, rq) { 2800 struct i915_request *w = 2801 container_of(p->waiter, typeof(*w), sched); 2802 2803 /* Leave semaphores spinning on the other engines */ 2804 if (w->engine != rq->engine) 2805 continue; 2806 2807 if (!i915_request_is_ready(w)) 2808 continue; 2809 2810 if (i915_request_completed(w)) 2811 continue; 2812 2813 if (i915_request_on_hold(w)) 2814 continue; 2815 2816 list_move_tail(&w->sched.link, &list); 2817 } 2818 2819 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2820 } while (rq); 2821 } 2822 2823 static bool execlists_hold(struct intel_engine_cs *engine, 2824 struct i915_request *rq) 2825 { 2826 if (i915_request_on_hold(rq)) 2827 return false; 2828 2829 spin_lock_irq(&engine->active.lock); 2830 2831 if (i915_request_completed(rq)) { /* too late! */ 2832 rq = NULL; 2833 goto unlock; 2834 } 2835 2836 if (rq->engine != engine) { /* preempted virtual engine */ 2837 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2838 2839 /* 2840 * intel_context_inflight() is only protected by virtue 2841 * of process_csb() being called only by the tasklet (or 2842 * directly from inside reset while the tasklet is suspended). 2843 * Assert that neither of those are allowed to run while we 2844 * poke at the request queues. 2845 */ 2846 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2847 2848 /* 2849 * An unsubmitted request along a virtual engine will 2850 * remain on the active (this) engine until we are able 2851 * to process the context switch away (and so mark the 2852 * context as no longer in flight). That cannot have happened 2853 * yet, otherwise we would not be hanging! 2854 */ 2855 spin_lock(&ve->base.active.lock); 2856 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2857 GEM_BUG_ON(ve->request != rq); 2858 ve->request = NULL; 2859 spin_unlock(&ve->base.active.lock); 2860 i915_request_put(rq); 2861 2862 rq->engine = engine; 2863 } 2864 2865 /* 2866 * Transfer this request onto the hold queue to prevent it 2867 * being resubmitted to HW (and potentially completed) before we have 2868 * released it. Since we may have already submitted following 2869 * requests, we need to remove those as well.
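 *
 * (__execlists_hold() walks the waiter tree for us: every ready
 * request on this engine that depends on rq is unsubmitted if needed
 * and parked on active.hold alongside it, so none of them can be
 * resubmitted until execlists_unhold().)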
2870 */ 2871 GEM_BUG_ON(i915_request_on_hold(rq)); 2872 GEM_BUG_ON(rq->engine != engine); 2873 __execlists_hold(rq); 2874 GEM_BUG_ON(list_empty(&engine->active.hold)); 2875 2876 unlock: 2877 spin_unlock_irq(&engine->active.lock); 2878 return rq; 2879 } 2880 2881 static bool hold_request(const struct i915_request *rq) 2882 { 2883 struct i915_dependency *p; 2884 bool result = false; 2885 2886 /* 2887 * If one of our ancestors is on hold, we must also be on hold, 2888 * otherwise we will bypass it and execute before it. 2889 */ 2890 rcu_read_lock(); 2891 for_each_signaler(p, rq) { 2892 const struct i915_request *s = 2893 container_of(p->signaler, typeof(*s), sched); 2894 2895 if (s->engine != rq->engine) 2896 continue; 2897 2898 result = i915_request_on_hold(s); 2899 if (result) 2900 break; 2901 } 2902 rcu_read_unlock(); 2903 2904 return result; 2905 } 2906 2907 static void __execlists_unhold(struct i915_request *rq) 2908 { 2909 LIST_HEAD(list); 2910 2911 do { 2912 struct i915_dependency *p; 2913 2914 RQ_TRACE(rq, "hold release\n"); 2915 2916 GEM_BUG_ON(!i915_request_on_hold(rq)); 2917 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2918 2919 i915_request_clear_hold(rq); 2920 list_move_tail(&rq->sched.link, 2921 i915_sched_lookup_priolist(rq->engine, 2922 rq_prio(rq))); 2923 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2924 2925 /* Also release any children on this engine that are ready */ 2926 for_each_waiter(p, rq) { 2927 struct i915_request *w = 2928 container_of(p->waiter, typeof(*w), sched); 2929 2930 /* Propagate any change in error status */ 2931 if (rq->fence.error) 2932 i915_request_set_error_once(w, rq->fence.error); 2933 2934 if (w->engine != rq->engine) 2935 continue; 2936 2937 if (!i915_request_on_hold(w)) 2938 continue; 2939 2940 /* Check that no other parents are also on hold */ 2941 if (hold_request(w)) 2942 continue; 2943 2944 list_move_tail(&w->sched.link, &list); 2945 } 2946 2947 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2948 } while (rq); 2949 } 2950 2951 static void execlists_unhold(struct intel_engine_cs *engine, 2952 struct i915_request *rq) 2953 { 2954 spin_lock_irq(&engine->active.lock); 2955 2956 /* 2957 * Move this request back to the priority queue, and all of its 2958 * children and grandchildren that were suspended along with it. 2959 */ 2960 __execlists_unhold(rq); 2961 2962 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2963 engine->execlists.queue_priority_hint = rq_prio(rq); 2964 tasklet_hi_schedule(&engine->execlists.tasklet); 2965 } 2966 2967 spin_unlock_irq(&engine->active.lock); 2968 } 2969 2970 struct execlists_capture { 2971 struct work_struct work; 2972 struct i915_request *rq; 2973 struct i915_gpu_coredump *error; 2974 }; 2975 2976 static void execlists_capture_work(struct work_struct *work) 2977 { 2978 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2979 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2980 struct intel_engine_cs *engine = cap->rq->engine; 2981 struct intel_gt_coredump *gt = cap->error->gt; 2982 struct intel_engine_capture_vma *vma; 2983 2984 /* Compress all the objects attached to the request, slow! 
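 *
 * (We run from a worker here, so GFP_KERNEL reclaim is permitted, but
 * the allocations are best-effort: __GFP_RETRY_MAYFAIL | __GFP_NOWARN
 * means we would rather lose part of the capture than stall or warn.)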
*/ 2985 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2986 if (vma) { 2987 struct i915_vma_compress *compress = 2988 i915_vma_capture_prepare(gt); 2989 2990 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2991 i915_vma_capture_finish(gt, compress); 2992 } 2993 2994 gt->simulated = gt->engine->simulated; 2995 cap->error->simulated = gt->simulated; 2996 2997 /* Publish the error state, and announce it to the world */ 2998 i915_error_state_store(cap->error); 2999 i915_gpu_coredump_put(cap->error); 3000 3001 /* Return this request and all that depend upon it for signaling */ 3002 execlists_unhold(engine, cap->rq); 3003 i915_request_put(cap->rq); 3004 3005 kfree(cap); 3006 } 3007 3008 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 3009 { 3010 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 3011 struct execlists_capture *cap; 3012 3013 cap = kmalloc(sizeof(*cap), gfp); 3014 if (!cap) 3015 return NULL; 3016 3017 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 3018 if (!cap->error) 3019 goto err_cap; 3020 3021 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 3022 if (!cap->error->gt) 3023 goto err_gpu; 3024 3025 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 3026 if (!cap->error->gt->engine) 3027 goto err_gt; 3028 3029 cap->error->gt->engine->hung = true; 3030 3031 return cap; 3032 3033 err_gt: 3034 kfree(cap->error->gt); 3035 err_gpu: 3036 kfree(cap->error); 3037 err_cap: 3038 kfree(cap); 3039 return NULL; 3040 } 3041 3042 static struct i915_request * 3043 active_context(struct intel_engine_cs *engine, u32 ccid) 3044 { 3045 const struct intel_engine_execlists * const el = &engine->execlists; 3046 struct i915_request * const *port, *rq; 3047 3048 /* 3049 * Use the most recent result from process_csb(), but just in case 3050 * we trigger an error (via interrupt) before the first CS event has 3051 * been written, peek at the next submission. 3052 */ 3053 3054 for (port = el->active; (rq = *port); port++) { 3055 if (rq->context->lrc.ccid == ccid) { 3056 ENGINE_TRACE(engine, 3057 "ccid found at active:%zd\n", 3058 port - el->active); 3059 return rq; 3060 } 3061 } 3062 3063 for (port = el->pending; (rq = *port); port++) { 3064 if (rq->context->lrc.ccid == ccid) { 3065 ENGINE_TRACE(engine, 3066 "ccid found at pending:%zd\n", 3067 port - el->pending); 3068 return rq; 3069 } 3070 } 3071 3072 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3073 return NULL; 3074 } 3075 3076 static u32 active_ccid(struct intel_engine_cs *engine) 3077 { 3078 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3079 } 3080 3081 static void execlists_capture(struct intel_engine_cs *engine) 3082 { 3083 struct execlists_capture *cap; 3084 3085 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3086 return; 3087 3088 /* 3089 * We need to _quickly_ capture the engine state before we reset. 3090 * We are inside an atomic section (softirq) here and we are delaying 3091 * the forced preemption event. 3092 */ 3093 cap = capture_regs(engine); 3094 if (!cap) 3095 return; 3096 3097 spin_lock_irq(&engine->active.lock); 3098 cap->rq = active_context(engine, active_ccid(engine)); 3099 if (cap->rq) { 3100 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3101 cap->rq = i915_request_get_rcu(cap->rq); 3102 } 3103 spin_unlock_irq(&engine->active.lock); 3104 if (!cap->rq) 3105 goto err_free; 3106 3107 /* 3108 * Remove the request from the execlists queue, and take ownership 3109 * of the request. 
We pass it to our worker who will _slowly_ compress 3110 * all the pages the _user_ requested for debugging their batch, after 3111 * which we return it to the queue for signaling. 3112 * 3113 * By removing them from the execlists queue, we also prevent the 3114 * requests from being processed by __unwind_incomplete_requests() 3115 * during the intel_engine_reset(), and so they will *not* be replayed 3116 * afterwards. 3117 * 3118 * Note that because we have not yet reset the engine at this point, 3119 * it is possible that the request we have identified as being 3120 * guilty did in fact complete and we will then hit an arbitration 3121 * point allowing the outstanding preemption to succeed. The likelihood 3122 * of that is very low (as capturing of the engine registers should be 3123 * fast enough to run inside an irq-off atomic section!), so we will 3124 * simply hold that request accountable for being non-preemptible 3125 * long enough to force the reset. 3126 */ 3127 if (!execlists_hold(engine, cap->rq)) 3128 goto err_rq; 3129 3130 INIT_WORK(&cap->work, execlists_capture_work); 3131 schedule_work(&cap->work); 3132 return; 3133 3134 err_rq: 3135 i915_request_put(cap->rq); 3136 err_free: 3137 i915_gpu_coredump_put(cap->error); 3138 kfree(cap); 3139 } 3140 3141 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3142 { 3143 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3144 unsigned long *lock = &engine->gt->reset.flags; 3145 3146 if (!intel_has_reset_engine(engine->gt)) 3147 return; 3148 3149 if (test_and_set_bit(bit, lock)) 3150 return; 3151 3152 ENGINE_TRACE(engine, "reset for %s\n", msg); 3153 3154 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3155 tasklet_disable_nosync(&engine->execlists.tasklet); 3156 3157 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3158 execlists_capture(engine); 3159 intel_engine_reset(engine, msg); 3160 3161 tasklet_enable(&engine->execlists.tasklet); 3162 clear_and_wake_up_bit(bit, lock); 3163 } 3164 3165 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3166 { 3167 const struct timer_list *t = &engine->execlists.preempt; 3168 3169 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3170 return false; 3171 3172 if (!timer_expired(t)) 3173 return false; 3174 3175 return READ_ONCE(engine->execlists.pending[0]); 3176 } 3177 3178 /* 3179 * Check the unread Context Status Buffers and manage the submission of new 3180 * contexts to the ELSP accordingly. 3181 */ 3182 static void execlists_submission_tasklet(unsigned long data) 3183 { 3184 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3185 bool timeout = preempt_timeout(engine); 3186 3187 process_csb(engine); 3188 3189 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3190 const char *msg; 3191 3192 /* Generate the error message in priority wrt the user!
*/ 3193 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3194 msg = "CS error"; /* thrown by a user payload */ 3195 else if (engine->execlists.error_interrupt & ERROR_CSB) 3196 msg = "invalid CSB event"; 3197 else 3198 msg = "internal error"; 3199 3200 engine->execlists.error_interrupt = 0; 3201 execlists_reset(engine, msg); 3202 } 3203 3204 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3205 unsigned long flags; 3206 3207 spin_lock_irqsave(&engine->active.lock, flags); 3208 __execlists_submission_tasklet(engine); 3209 spin_unlock_irqrestore(&engine->active.lock, flags); 3210 3211 /* Recheck after serialising with direct-submission */ 3212 if (unlikely(timeout && preempt_timeout(engine))) { 3213 cancel_timer(&engine->execlists.preempt); 3214 execlists_reset(engine, "preemption time out"); 3215 } 3216 } 3217 } 3218 3219 static void __execlists_kick(struct intel_engine_execlists *execlists) 3220 { 3221 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3222 tasklet_hi_schedule(&execlists->tasklet); 3223 } 3224 3225 #define execlists_kick(t, member) \ 3226 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3227 3228 static void execlists_timeslice(struct timer_list *timer) 3229 { 3230 execlists_kick(timer, timer); 3231 } 3232 3233 static void execlists_preempt(struct timer_list *timer) 3234 { 3235 execlists_kick(timer, preempt); 3236 } 3237 3238 static void queue_request(struct intel_engine_cs *engine, 3239 struct i915_request *rq) 3240 { 3241 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3242 list_add_tail(&rq->sched.link, 3243 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3244 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3245 } 3246 3247 static void __submit_queue_imm(struct intel_engine_cs *engine) 3248 { 3249 struct intel_engine_execlists * const execlists = &engine->execlists; 3250 3251 if (reset_in_progress(execlists)) 3252 return; /* defer until we restart the engine following reset */ 3253 3254 __execlists_submission_tasklet(engine); 3255 } 3256 3257 static void submit_queue(struct intel_engine_cs *engine, 3258 const struct i915_request *rq) 3259 { 3260 struct intel_engine_execlists *execlists = &engine->execlists; 3261 3262 if (rq_prio(rq) <= execlists->queue_priority_hint) 3263 return; 3264 3265 execlists->queue_priority_hint = rq_prio(rq); 3266 __submit_queue_imm(engine); 3267 } 3268 3269 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3270 const struct i915_request *rq) 3271 { 3272 GEM_BUG_ON(i915_request_on_hold(rq)); 3273 return !list_empty(&engine->active.hold) && hold_request(rq); 3274 } 3275 3276 static void flush_csb(struct intel_engine_cs *engine) 3277 { 3278 struct intel_engine_execlists *el = &engine->execlists; 3279 3280 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3281 if (!reset_in_progress(el)) 3282 process_csb(engine); 3283 tasklet_unlock(&el->tasklet); 3284 } 3285 } 3286 3287 static void execlists_submit_request(struct i915_request *request) 3288 { 3289 struct intel_engine_cs *engine = request->engine; 3290 unsigned long flags; 3291 3292 /* Hopefully we clear execlists->pending[] to let us through */ 3293 flush_csb(engine); 3294 3295 /* Will be called from irq-context when using foreign fences. 
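 * (Hence the irqsave form of the lock that follows: a foreign fence
 * may signal from its own interrupt handler, so we cannot assume
 * interrupts are enabled when we get here.)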
*/ 3296 spin_lock_irqsave(&engine->active.lock, flags); 3297 3298 if (unlikely(ancestor_on_hold(engine, request))) { 3299 RQ_TRACE(request, "ancestor on hold\n"); 3300 list_add_tail(&request->sched.link, &engine->active.hold); 3301 i915_request_set_hold(request); 3302 } else { 3303 queue_request(engine, request); 3304 3305 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3306 GEM_BUG_ON(list_empty(&request->sched.link)); 3307 3308 submit_queue(engine, request); 3309 } 3310 3311 spin_unlock_irqrestore(&engine->active.lock, flags); 3312 } 3313 3314 static void __execlists_context_fini(struct intel_context *ce) 3315 { 3316 intel_ring_put(ce->ring); 3317 i915_vma_put(ce->state); 3318 } 3319 3320 static void execlists_context_destroy(struct kref *kref) 3321 { 3322 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3323 3324 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3325 GEM_BUG_ON(intel_context_is_pinned(ce)); 3326 3327 if (ce->state) 3328 __execlists_context_fini(ce); 3329 3330 intel_context_fini(ce); 3331 intel_context_free(ce); 3332 } 3333 3334 static void 3335 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3336 { 3337 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3338 return; 3339 3340 vaddr += engine->context_size; 3341 3342 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3343 } 3344 3345 static void 3346 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3347 { 3348 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3349 return; 3350 3351 vaddr += engine->context_size; 3352 3353 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3354 drm_err_once(&engine->i915->drm, 3355 "%s context redzone overwritten!\n", 3356 engine->name); 3357 } 3358 3359 static void execlists_context_unpin(struct intel_context *ce) 3360 { 3361 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3362 ce->engine); 3363 } 3364 3365 static void execlists_context_post_unpin(struct intel_context *ce) 3366 { 3367 i915_gem_object_unpin_map(ce->state->obj); 3368 } 3369 3370 static u32 * 3371 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3372 { 3373 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3374 MI_SRM_LRM_GLOBAL_GTT | 3375 MI_LRI_LRM_CS_MMIO; 3376 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3377 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3378 CTX_TIMESTAMP * sizeof(u32); 3379 *cs++ = 0; 3380 3381 *cs++ = MI_LOAD_REGISTER_REG | 3382 MI_LRR_SOURCE_CS_MMIO | 3383 MI_LRI_LRM_CS_MMIO; 3384 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3385 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3386 3387 *cs++ = MI_LOAD_REGISTER_REG | 3388 MI_LRR_SOURCE_CS_MMIO | 3389 MI_LRI_LRM_CS_MMIO; 3390 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3391 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3392 3393 return cs; 3394 } 3395 3396 static u32 * 3397 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3398 { 3399 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3400 3401 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3402 MI_SRM_LRM_GLOBAL_GTT | 3403 MI_LRI_LRM_CS_MMIO; 3404 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3405 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3406 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3407 *cs++ = 0; 3408 3409 return cs; 3410 } 3411 3412 static u32 * 3413 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3414 { 3415 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3416 3417 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3418 
MI_SRM_LRM_GLOBAL_GTT | 3419 MI_LRI_LRM_CS_MMIO; 3420 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3421 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3422 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3423 *cs++ = 0; 3424 3425 *cs++ = MI_LOAD_REGISTER_REG | 3426 MI_LRR_SOURCE_CS_MMIO | 3427 MI_LRI_LRM_CS_MMIO; 3428 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3429 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3430 3431 return cs; 3432 } 3433 3434 static u32 * 3435 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3436 { 3437 cs = gen12_emit_timestamp_wa(ce, cs); 3438 cs = gen12_emit_cmd_buf_wa(ce, cs); 3439 cs = gen12_emit_restore_scratch(ce, cs); 3440 3441 return cs; 3442 } 3443 3444 static u32 * 3445 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3446 { 3447 cs = gen12_emit_timestamp_wa(ce, cs); 3448 cs = gen12_emit_restore_scratch(ce, cs); 3449 3450 return cs; 3451 } 3452 3453 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3454 { 3455 return PAGE_SIZE * ce->wa_bb_page; 3456 } 3457 3458 static u32 *context_indirect_bb(const struct intel_context *ce) 3459 { 3460 void *ptr; 3461 3462 GEM_BUG_ON(!ce->wa_bb_page); 3463 3464 ptr = ce->lrc_reg_state; 3465 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3466 ptr += context_wa_bb_offset(ce); 3467 3468 return ptr; 3469 } 3470 3471 static void 3472 setup_indirect_ctx_bb(const struct intel_context *ce, 3473 const struct intel_engine_cs *engine, 3474 u32 *(*emit)(const struct intel_context *, u32 *)) 3475 { 3476 u32 * const start = context_indirect_bb(ce); 3477 u32 *cs; 3478 3479 cs = emit(ce, start); 3480 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3481 while ((unsigned long)cs % CACHELINE_BYTES) 3482 *cs++ = MI_NOOP; 3483 3484 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3485 i915_ggtt_offset(ce->state) + 3486 context_wa_bb_offset(ce), 3487 (cs - start) * sizeof(*cs)); 3488 } 3489 3490 static void 3491 __execlists_update_reg_state(const struct intel_context *ce, 3492 const struct intel_engine_cs *engine, 3493 u32 head) 3494 { 3495 struct intel_ring *ring = ce->ring; 3496 u32 *regs = ce->lrc_reg_state; 3497 3498 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3499 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3500 3501 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3502 regs[CTX_RING_HEAD] = head; 3503 regs[CTX_RING_TAIL] = ring->tail; 3504 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3505 3506 /* RPCS */ 3507 if (engine->class == RENDER_CLASS) { 3508 regs[CTX_R_PWR_CLK_STATE] = 3509 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3510 3511 i915_oa_init_reg_state(ce, engine); 3512 } 3513 3514 if (ce->wa_bb_page) { 3515 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3516 3517 fn = gen12_emit_indirect_ctx_xcs; 3518 if (ce->engine->class == RENDER_CLASS) 3519 fn = gen12_emit_indirect_ctx_rcs; 3520 3521 /* Mutually exclusive wrt to global indirect bb */ 3522 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3523 setup_indirect_ctx_bb(ce, engine, fn); 3524 } 3525 } 3526 3527 static int 3528 execlists_context_pre_pin(struct intel_context *ce, 3529 struct i915_gem_ww_ctx *ww, void **vaddr) 3530 { 3531 GEM_BUG_ON(!ce->state); 3532 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3533 3534 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3535 i915_coherent_map_type(ce->engine->i915) | 3536 I915_MAP_OVERRIDE); 3537 3538 return PTR_ERR_OR_ZERO(*vaddr); 3539 } 3540 3541 static int 3542 
__execlists_context_pin(struct intel_context *ce, 3543 struct intel_engine_cs *engine, 3544 void *vaddr) 3545 { 3546 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3547 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3548 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3549 3550 return 0; 3551 } 3552 3553 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3554 { 3555 return __execlists_context_pin(ce, ce->engine, vaddr); 3556 } 3557 3558 static int execlists_context_alloc(struct intel_context *ce) 3559 { 3560 return __execlists_context_alloc(ce, ce->engine); 3561 } 3562 3563 static void execlists_context_reset(struct intel_context *ce) 3564 { 3565 CE_TRACE(ce, "reset\n"); 3566 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3567 3568 intel_ring_reset(ce->ring, ce->ring->emit); 3569 3570 /* Scrub away the garbage */ 3571 execlists_init_reg_state(ce->lrc_reg_state, 3572 ce, ce->engine, ce->ring, true); 3573 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3574 3575 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3576 } 3577 3578 static const struct intel_context_ops execlists_context_ops = { 3579 .alloc = execlists_context_alloc, 3580 3581 .pre_pin = execlists_context_pre_pin, 3582 .pin = execlists_context_pin, 3583 .unpin = execlists_context_unpin, 3584 .post_unpin = execlists_context_post_unpin, 3585 3586 .enter = intel_context_enter_engine, 3587 .exit = intel_context_exit_engine, 3588 3589 .reset = execlists_context_reset, 3590 .destroy = execlists_context_destroy, 3591 }; 3592 3593 static u32 hwsp_offset(const struct i915_request *rq) 3594 { 3595 const struct intel_timeline_cacheline *cl; 3596 3597 /* Before the request is executed, the timeline/cacheline is fixed */ 3598 3599 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); 3600 if (cl) 3601 return cl->ggtt_offset; 3602 3603 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; 3604 } 3605 3606 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3607 { 3608 u32 *cs; 3609 3610 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3611 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3612 return 0; 3613 3614 cs = intel_ring_begin(rq, 6); 3615 if (IS_ERR(cs)) 3616 return PTR_ERR(cs); 3617 3618 /* 3619 * Check if we have been preempted before we even get started. 3620 * 3621 * After this point i915_request_started() reports true, even if 3622 * we get preempted and so are no longer running. 3623 */ 3624 *cs++ = MI_ARB_CHECK; 3625 *cs++ = MI_NOOP; 3626 3627 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3628 *cs++ = hwsp_offset(rq); 3629 *cs++ = 0; 3630 *cs++ = rq->fence.seqno - 1; 3631 3632 intel_ring_advance(rq, cs); 3633 3634 /* Record the updated position of the request's payload */ 3635 rq->infix = intel_ring_offset(rq, cs); 3636 3637 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3638 3639 return 0; 3640 } 3641 3642 static int emit_pdps(struct i915_request *rq) 3643 { 3644 const struct intel_engine_cs * const engine = rq->engine; 3645 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3646 int err, i; 3647 u32 *cs; 3648 3649 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3650 3651 /* 3652 * Beware ye of the dragons, this sequence is magic! 3653 * 3654 * Small changes to this sequence can cause anything from 3655 * GPU hangs to forcewake errors and machine lockups!
3656 */ 3657 3658 /* Flush any residual operations from the context load */ 3659 err = engine->emit_flush(rq, EMIT_FLUSH); 3660 if (err) 3661 return err; 3662 3663 /* Magic required to prevent forcewake errors! */ 3664 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3665 if (err) 3666 return err; 3667 3668 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3669 if (IS_ERR(cs)) 3670 return PTR_ERR(cs); 3671 3672 /* Ensure the LRI have landed before we invalidate & continue */ 3673 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3674 for (i = GEN8_3LVL_PDPES; i--; ) { 3675 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3676 u32 base = engine->mmio_base; 3677 3678 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3679 *cs++ = upper_32_bits(pd_daddr); 3680 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3681 *cs++ = lower_32_bits(pd_daddr); 3682 } 3683 *cs++ = MI_NOOP; 3684 3685 intel_ring_advance(rq, cs); 3686 3687 return 0; 3688 } 3689 3690 static int execlists_request_alloc(struct i915_request *request) 3691 { 3692 int ret; 3693 3694 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3695 3696 /* 3697 * Flush enough space to reduce the likelihood of waiting after 3698 * we start building the request - in which case we will just 3699 * have to repeat work. 3700 */ 3701 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3702 3703 /* 3704 * Note that after this point, we have committed to using 3705 * this request as it is being used to both track the 3706 * state of engine initialisation and liveness of the 3707 * golden renderstate above. Think twice before you try 3708 * to cancel/unwind this request now. 3709 */ 3710 3711 if (!i915_vm_is_4lvl(request->context->vm)) { 3712 ret = emit_pdps(request); 3713 if (ret) 3714 return ret; 3715 } 3716 3717 /* Unconditionally invalidate GPU caches and TLBs. */ 3718 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3719 if (ret) 3720 return ret; 3721 3722 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3723 return 0; 3724 } 3725 3726 /* 3727 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after 3728 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3729 * but there is a slight complication as this is applied in WA batch where the 3730 * values are only initialized once so we cannot take the register value at the 3731 * beginning and reuse it further; hence we save its value to memory, upload a 3732 * constant value with bit21 set and then we restore it back with the saved value. 3733 * To simplify the WA, a constant value is formed by using the default value 3734 * of this register. This shouldn't be a problem because we are only modifying 3735 * it for a short period and this batch is non-preemptible. We can of course 3736 * use additional instructions that read the actual value of the register 3737 * at that time and set our bit of interest but it makes the WA complicated. 3738 * 3739 * This WA is also required for Gen9 so extracting as a function avoids 3740 * code duplication. 3741 */ 3742 static u32 * 3743 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3744 { 3745 /* NB no one else is allowed to scribble over scratch + 256!
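 * The sequence below is: save GEN8_L3SQCREG4 into that scratch slot,
 * load the known default value with the flush bit set, emit the
 * CS-stall + DC-flush PIPE_CONTROL, then restore the saved value
 * from scratch.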
	 */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDs written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3851 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3852 }, 3853 3854 /* BSpec: 11299 */ 3855 { 3856 _3D_CHICKEN3, 3857 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3858 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3859 } 3860 }; 3861 3862 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3863 3864 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3865 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3866 3867 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3868 batch = gen8_emit_pipe_control(batch, 3869 PIPE_CONTROL_FLUSH_L3 | 3870 PIPE_CONTROL_STORE_DATA_INDEX | 3871 PIPE_CONTROL_CS_STALL | 3872 PIPE_CONTROL_QW_WRITE, 3873 LRC_PPHWSP_SCRATCH_ADDR); 3874 3875 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3876 3877 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3878 if (HAS_POOLED_EU(engine->i915)) { 3879 /* 3880 * EU pool configuration is setup along with golden context 3881 * during context initialization. This value depends on 3882 * device type (2x6 or 3x6) and needs to be updated based 3883 * on which subslice is disabled especially for 2x6 3884 * devices, however it is safe to load default 3885 * configuration of 3x6 device instead of masking off 3886 * corresponding bits because HW ignores bits of a disabled 3887 * subslice and drops down to appropriate config. Please 3888 * see render_state_setup() in i915_gem_render_state.c for 3889 * possible configurations, to avoid duplication they are 3890 * not shown here again. 3891 */ 3892 *batch++ = GEN9_MEDIA_POOL_STATE; 3893 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3894 *batch++ = 0x00777000; 3895 *batch++ = 0; 3896 *batch++ = 0; 3897 *batch++ = 0; 3898 } 3899 3900 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3901 3902 /* Pad to end of cacheline */ 3903 while ((unsigned long)batch % CACHELINE_BYTES) 3904 *batch++ = MI_NOOP; 3905 3906 return batch; 3907 } 3908 3909 static u32 * 3910 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3911 { 3912 int i; 3913 3914 /* 3915 * WaPipeControlBefore3DStateSamplePattern: cnl 3916 * 3917 * Ensure the engine is idle prior to programming a 3918 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3919 */ 3920 batch = gen8_emit_pipe_control(batch, 3921 PIPE_CONTROL_CS_STALL, 3922 0); 3923 /* 3924 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3925 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3926 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3927 * confusing. Since gen8_emit_pipe_control() already advances the 3928 * batch by 6 dwords, we advance the other 10 here, completing a 3929 * cacheline. It's not clear if the workaround requires this padding 3930 * before other commands, or if it's just the regular padding we would 3931 * already have for the workaround bb, so leave it here for now. 
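	 *
	 * (With CACHELINE_BYTES being 64, those 6 + 10 dwords come to exactly
	 * 64 bytes, i.e. the padding completes one full cacheline either way.)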
3932 */ 3933 for (i = 0; i < 10; i++) 3934 *batch++ = MI_NOOP; 3935 3936 /* Pad to end of cacheline */ 3937 while ((unsigned long)batch % CACHELINE_BYTES) 3938 *batch++ = MI_NOOP; 3939 3940 return batch; 3941 } 3942 3943 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3944 3945 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3946 { 3947 struct drm_i915_gem_object *obj; 3948 struct i915_vma *vma; 3949 int err; 3950 3951 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3952 if (IS_ERR(obj)) 3953 return PTR_ERR(obj); 3954 3955 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3956 if (IS_ERR(vma)) { 3957 err = PTR_ERR(vma); 3958 goto err; 3959 } 3960 3961 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3962 if (err) 3963 goto err; 3964 3965 engine->wa_ctx.vma = vma; 3966 return 0; 3967 3968 err: 3969 i915_gem_object_put(obj); 3970 return err; 3971 } 3972 3973 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3974 { 3975 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3976 } 3977 3978 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3979 3980 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3981 { 3982 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3983 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3984 &wa_ctx->per_ctx }; 3985 wa_bb_func_t wa_bb_fn[2]; 3986 void *batch, *batch_ptr; 3987 unsigned int i; 3988 int ret; 3989 3990 if (engine->class != RENDER_CLASS) 3991 return 0; 3992 3993 switch (INTEL_GEN(engine->i915)) { 3994 case 12: 3995 case 11: 3996 return 0; 3997 case 10: 3998 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3999 wa_bb_fn[1] = NULL; 4000 break; 4001 case 9: 4002 wa_bb_fn[0] = gen9_init_indirectctx_bb; 4003 wa_bb_fn[1] = NULL; 4004 break; 4005 case 8: 4006 wa_bb_fn[0] = gen8_init_indirectctx_bb; 4007 wa_bb_fn[1] = NULL; 4008 break; 4009 default: 4010 MISSING_CASE(INTEL_GEN(engine->i915)); 4011 return 0; 4012 } 4013 4014 ret = lrc_setup_wa_ctx(engine); 4015 if (ret) { 4016 drm_dbg(&engine->i915->drm, 4017 "Failed to setup context WA page: %d\n", ret); 4018 return ret; 4019 } 4020 4021 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 4022 4023 /* 4024 * Emit the two workaround batch buffers, recording the offset from the 4025 * start of the workaround batch buffer object for each and their 4026 * respective sizes. 4027 */ 4028 batch_ptr = batch; 4029 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 4030 wa_bb[i]->offset = batch_ptr - batch; 4031 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 4032 CACHELINE_BYTES))) { 4033 ret = -EINVAL; 4034 break; 4035 } 4036 if (wa_bb_fn[i]) 4037 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 4038 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 4039 } 4040 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 4041 4042 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4043 __i915_gem_object_release_map(wa_ctx->vma->obj); 4044 if (ret) 4045 lrc_destroy_wa_ctx(engine); 4046 4047 return ret; 4048 } 4049 4050 static void reset_csb_pointers(struct intel_engine_cs *engine) 4051 { 4052 struct intel_engine_execlists * const execlists = &engine->execlists; 4053 const unsigned int reset_value = execlists->csb_size - 1; 4054 4055 ring_set_paused(engine, 0); 4056 4057 /* 4058 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4059 * Bludgeon them with a mmio update to be sure. 
	 */
	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
		     0xffff << 16 | reset_value << 8 | reset_value);
	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);

	/*
	 * After a reset, the HW starts writing into CSB entry [0]. We
	 * therefore have to set our HEAD pointer back one entry so that
	 * the *first* entry we check is entry 0. To complicate this further,
	 * as we don't wait for the first interrupt after reset, we have to
	 * fake the HW write to point back to the last entry so that our
	 * inline comparison of our cached head position against the last HW
	 * write works even before the first interrupt.
	 */
	execlists->csb_head = reset_value;
	WRITE_ONCE(*execlists->csb_write, reset_value);
	wmb(); /* Make sure this is visible to HW (paranoia?) */

	/* Check that the GPU does indeed update the CSB entries! */
	memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
	invalidate_csb_entries(&execlists->csb_status[0],
			       &execlists->csb_status[reset_value]);

	/* Once more for luck and our trusty paranoia */
	ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
		     0xffff << 16 | reset_value << 8 | reset_value);
	ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);

	GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
}

static void execlists_sanitize(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(execlists_active(&engine->execlists));

	/*
	 * Poison residual state on resume, in case the suspend didn't!
	 *
	 * We have to assume that across suspend/resume (or other loss
	 * of control) the contents of our pinned buffers have been
	 * lost, replaced by garbage. Since this doesn't always happen,
	 * let's poison such state so that we more quickly spot when
	 * we falsely assume it has been preserved.
	 */
	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);

	reset_csb_pointers(engine);

	/*
	 * The kernel_context HWSP is stored in the status_page. As above,
	 * that may be lost on resume/initialisation, and so we need to
	 * reset the value in the HWSP.
	 */
	intel_timeline_reset_seqno(engine->kernel_context->timeline);

	/* And scrub the dirty cachelines for the HWSP */
	clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
}

static void enable_error_interrupt(struct intel_engine_cs *engine)
{
	u32 status;

	engine->execlists.error_interrupt = 0;
	ENGINE_WRITE(engine, RING_EMR, ~0u);
	ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */

	status = ENGINE_READ(engine, RING_ESR);
	if (unlikely(status)) {
		drm_err(&engine->i915->drm,
			"engine '%s' resumed still in error: %08x\n",
			engine->name, status);
		__intel_gt_reset(engine->gt, engine->mask);
	}

	/*
	 * On current gen8+, we have 2 signals to play with
	 *
	 * - I915_ERROR_INSTRUCTION (bit 0)
	 *
	 *   Generate an error if the command parser encounters an invalid
	 *   instruction.
	 *
	 *   This is a fatal error.
	 *
	 * - CP_PRIV (bit 2)
	 *
	 *   Generate an error on privilege violation (where the CP replaces
	 *   the instruction with a no-op). This also fires for writes into
	 *   read-only scratch pages.
	 *
	 *   This is a non-fatal error, parsing continues.
4153 * 4154 * * there are a few others defined for odd HW that we do not use 4155 * 4156 * Since CP_PRIV fires for cases where we have chosen to ignore the 4157 * error (as the HW is validating and suppressing the mistakes), we 4158 * only unmask the instruction error bit. 4159 */ 4160 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4161 } 4162 4163 static void enable_execlists(struct intel_engine_cs *engine) 4164 { 4165 u32 mode; 4166 4167 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4168 4169 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4170 4171 if (INTEL_GEN(engine->i915) >= 11) 4172 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4173 else 4174 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4175 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4176 4177 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4178 4179 ENGINE_WRITE_FW(engine, 4180 RING_HWS_PGA, 4181 i915_ggtt_offset(engine->status_page.vma)); 4182 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4183 4184 enable_error_interrupt(engine); 4185 4186 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4187 } 4188 4189 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4190 { 4191 bool unexpected = false; 4192 4193 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4194 drm_dbg(&engine->i915->drm, 4195 "STOP_RING still set in RING_MI_MODE\n"); 4196 unexpected = true; 4197 } 4198 4199 return unexpected; 4200 } 4201 4202 static int execlists_resume(struct intel_engine_cs *engine) 4203 { 4204 intel_mocs_init_engine(engine); 4205 4206 intel_breadcrumbs_reset(engine->breadcrumbs); 4207 4208 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4209 struct drm_printer p = drm_debug_printer(__func__); 4210 4211 intel_engine_dump(engine, &p, NULL); 4212 } 4213 4214 enable_execlists(engine); 4215 4216 return 0; 4217 } 4218 4219 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4220 { 4221 struct intel_engine_execlists * const execlists = &engine->execlists; 4222 unsigned long flags; 4223 4224 ENGINE_TRACE(engine, "depth<-%d\n", 4225 atomic_read(&execlists->tasklet.count)); 4226 4227 /* 4228 * Prevent request submission to the hardware until we have 4229 * completed the reset in i915_gem_reset_finish(). If a request 4230 * is completed by one engine, it may then queue a request 4231 * to a second via its execlists->tasklet *just* as we are 4232 * calling engine->resume() and also writing the ELSP. 4233 * Turning off the execlists->tasklet until the reset is over 4234 * prevents the race. 4235 */ 4236 __tasklet_disable_sync_once(&execlists->tasklet); 4237 GEM_BUG_ON(!reset_in_progress(execlists)); 4238 4239 /* And flush any current direct submission. */ 4240 spin_lock_irqsave(&engine->active.lock, flags); 4241 spin_unlock_irqrestore(&engine->active.lock, flags); 4242 4243 /* 4244 * We stop engines, otherwise we might get failed reset and a 4245 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4246 * from system hang if batchbuffer is progressing when 4247 * the reset is issued, regardless of READY_TO_RESET ack. 4248 * Thus assume it is best to stop engines on all gens 4249 * where we have a gpu reset. 
4250 * 4251 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4252 * 4253 * FIXME: Wa for more modern gens needs to be validated 4254 */ 4255 ring_set_paused(engine, 1); 4256 intel_engine_stop_cs(engine); 4257 4258 engine->execlists.reset_ccid = active_ccid(engine); 4259 } 4260 4261 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4262 { 4263 int x; 4264 4265 x = lrc_ring_mi_mode(engine); 4266 if (x != -1) { 4267 regs[x + 1] &= ~STOP_RING; 4268 regs[x + 1] |= STOP_RING << 16; 4269 } 4270 } 4271 4272 static void __execlists_reset_reg_state(const struct intel_context *ce, 4273 const struct intel_engine_cs *engine) 4274 { 4275 u32 *regs = ce->lrc_reg_state; 4276 4277 __reset_stop_ring(regs, engine); 4278 } 4279 4280 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4281 { 4282 struct intel_engine_execlists * const execlists = &engine->execlists; 4283 struct intel_context *ce; 4284 struct i915_request *rq; 4285 u32 head; 4286 4287 mb(); /* paranoia: read the CSB pointers from after the reset */ 4288 clflush(execlists->csb_write); 4289 mb(); 4290 4291 process_csb(engine); /* drain preemption events */ 4292 4293 /* Following the reset, we need to reload the CSB read/write pointers */ 4294 reset_csb_pointers(engine); 4295 4296 /* 4297 * Save the currently executing context, even if we completed 4298 * its request, it was still running at the time of the 4299 * reset and will have been clobbered. 4300 */ 4301 rq = active_context(engine, engine->execlists.reset_ccid); 4302 if (!rq) 4303 goto unwind; 4304 4305 ce = rq->context; 4306 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4307 4308 if (i915_request_completed(rq)) { 4309 /* Idle context; tidy up the ring so we can restart afresh */ 4310 head = intel_ring_wrap(ce->ring, rq->tail); 4311 goto out_replay; 4312 } 4313 4314 /* We still have requests in-flight; the engine should be active */ 4315 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4316 4317 /* Context has requests still in-flight; it should not be idle! */ 4318 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4319 4320 rq = active_request(ce->timeline, rq); 4321 head = intel_ring_wrap(ce->ring, rq->head); 4322 GEM_BUG_ON(head == ce->ring->tail); 4323 4324 /* 4325 * If this request hasn't started yet, e.g. it is waiting on a 4326 * semaphore, we need to avoid skipping the request or else we 4327 * break the signaling chain. However, if the context is corrupt 4328 * the request will not restart and we will be stuck with a wedged 4329 * device. It is quite often the case that if we issue a reset 4330 * while the GPU is loading the context image, that the context 4331 * image becomes corrupt. 4332 * 4333 * Otherwise, if we have not started yet, the request should replay 4334 * perfectly and we do not need to flag the result as being erroneous. 4335 */ 4336 if (!i915_request_started(rq)) 4337 goto out_replay; 4338 4339 /* 4340 * If the request was innocent, we leave the request in the ELSP 4341 * and will try to replay it on restarting. The context image may 4342 * have been corrupted by the reset, in which case we may have 4343 * to service a new GPU hang, but more likely we can continue on 4344 * without impact. 4345 * 4346 * If the request was guilty, we presume the context is corrupt 4347 * and have to at least restore the RING register in the context 4348 * image back to the expected values to skip over the guilty request. 
4349 */ 4350 __i915_request_reset(rq, stalled); 4351 4352 /* 4353 * We want a simple context + ring to execute the breadcrumb update. 4354 * We cannot rely on the context being intact across the GPU hang, 4355 * so clear it and rebuild just what we need for the breadcrumb. 4356 * All pending requests for this context will be zapped, and any 4357 * future request will be after userspace has had the opportunity 4358 * to recreate its own state. 4359 */ 4360 out_replay: 4361 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4362 head, ce->ring->tail); 4363 __execlists_reset_reg_state(ce, engine); 4364 __execlists_update_reg_state(ce, engine, head); 4365 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4366 4367 unwind: 4368 /* Push back any incomplete requests for replay after the reset. */ 4369 cancel_port_requests(execlists); 4370 __unwind_incomplete_requests(engine); 4371 } 4372 4373 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4374 { 4375 unsigned long flags; 4376 4377 ENGINE_TRACE(engine, "\n"); 4378 4379 spin_lock_irqsave(&engine->active.lock, flags); 4380 4381 __execlists_reset(engine, stalled); 4382 4383 spin_unlock_irqrestore(&engine->active.lock, flags); 4384 } 4385 4386 static void nop_submission_tasklet(unsigned long data) 4387 { 4388 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4389 4390 /* The driver is wedged; don't process any more events. */ 4391 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4392 } 4393 4394 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4395 { 4396 struct intel_engine_execlists * const execlists = &engine->execlists; 4397 struct i915_request *rq, *rn; 4398 struct rb_node *rb; 4399 unsigned long flags; 4400 4401 ENGINE_TRACE(engine, "\n"); 4402 4403 /* 4404 * Before we call engine->cancel_requests(), we should have exclusive 4405 * access to the submission state. This is arranged for us by the 4406 * caller disabling the interrupt generation, the tasklet and other 4407 * threads that may then access the same state, giving us a free hand 4408 * to reset state. However, we still need to let lockdep be aware that 4409 * we know this state may be accessed in hardirq context, so we 4410 * disable the irq around this manipulation and we want to keep 4411 * the spinlock focused on its duties and not accidentally conflate 4412 * coverage to the submission's irq state. (Similarly, although we 4413 * shouldn't need to disable irq around the manipulation of the 4414 * submission's irq state, we also wish to remind ourselves that 4415 * it is irq state.) 4416 */ 4417 spin_lock_irqsave(&engine->active.lock, flags); 4418 4419 __execlists_reset(engine, true); 4420 4421 /* Mark all executing requests as skipped. */ 4422 list_for_each_entry(rq, &engine->active.requests, sched.link) 4423 mark_eio(rq); 4424 intel_engine_signal_breadcrumbs(engine); 4425 4426 /* Flush the queued requests to the timeline list (for retiring). 
*/ 4427 while ((rb = rb_first_cached(&execlists->queue))) { 4428 struct i915_priolist *p = to_priolist(rb); 4429 int i; 4430 4431 priolist_for_each_request_consume(rq, rn, p, i) { 4432 mark_eio(rq); 4433 __i915_request_submit(rq); 4434 } 4435 4436 rb_erase_cached(&p->node, &execlists->queue); 4437 i915_priolist_free(p); 4438 } 4439 4440 /* On-hold requests will be flushed to timeline upon their release */ 4441 list_for_each_entry(rq, &engine->active.hold, sched.link) 4442 mark_eio(rq); 4443 4444 /* Cancel all attached virtual engines */ 4445 while ((rb = rb_first_cached(&execlists->virtual))) { 4446 struct virtual_engine *ve = 4447 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4448 4449 rb_erase_cached(rb, &execlists->virtual); 4450 RB_CLEAR_NODE(rb); 4451 4452 spin_lock(&ve->base.active.lock); 4453 rq = fetch_and_zero(&ve->request); 4454 if (rq) { 4455 mark_eio(rq); 4456 4457 rq->engine = engine; 4458 __i915_request_submit(rq); 4459 i915_request_put(rq); 4460 4461 ve->base.execlists.queue_priority_hint = INT_MIN; 4462 } 4463 spin_unlock(&ve->base.active.lock); 4464 } 4465 4466 /* Remaining _unready_ requests will be nop'ed when submitted */ 4467 4468 execlists->queue_priority_hint = INT_MIN; 4469 execlists->queue = RB_ROOT_CACHED; 4470 4471 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4472 execlists->tasklet.func = nop_submission_tasklet; 4473 4474 spin_unlock_irqrestore(&engine->active.lock, flags); 4475 } 4476 4477 static void execlists_reset_finish(struct intel_engine_cs *engine) 4478 { 4479 struct intel_engine_execlists * const execlists = &engine->execlists; 4480 4481 /* 4482 * After a GPU reset, we may have requests to replay. Do so now while 4483 * we still have the forcewake to be sure that the GPU is not allowed 4484 * to sleep before we restart and reload a context. 4485 */ 4486 GEM_BUG_ON(!reset_in_progress(execlists)); 4487 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4488 execlists->tasklet.func(execlists->tasklet.data); 4489 4490 if (__tasklet_enable(&execlists->tasklet)) 4491 /* And kick in case we missed a new request submission. */ 4492 tasklet_hi_schedule(&execlists->tasklet); 4493 ENGINE_TRACE(engine, "depth->%d\n", 4494 atomic_read(&execlists->tasklet.count)); 4495 } 4496 4497 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4498 u64 offset, u32 len, 4499 const unsigned int flags) 4500 { 4501 u32 *cs; 4502 4503 cs = intel_ring_begin(rq, 4); 4504 if (IS_ERR(cs)) 4505 return PTR_ERR(cs); 4506 4507 /* 4508 * WaDisableCtxRestoreArbitration:bdw,chv 4509 * 4510 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4511 * particular all the gen that do not need the w/a at all!), if we 4512 * took care to make sure that on every switch into this context 4513 * (both ordinary and for preemption) that arbitrartion was enabled 4514 * we would be fine. However, for gen8 there is another w/a that 4515 * requires us to not preempt inside GPGPU execution, so we keep 4516 * arbitration disabled for gen8 batches. Arbitration will be 4517 * re-enabled before we close the request 4518 * (engine->emit_fini_breadcrumb). 4519 */ 4520 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4521 4522 /* FIXME(BDW+): Address space and security selectors. */ 4523 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4524 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4525 *cs++ = lower_32_bits(offset); 4526 *cs++ = upper_32_bits(offset); 4527 4528 intel_ring_advance(rq, cs); 4529 4530 return 0; 4531 } 4532 4533 static int gen8_emit_bb_start(struct i915_request *rq, 4534 u64 offset, u32 len, 4535 const unsigned int flags) 4536 { 4537 u32 *cs; 4538 4539 cs = intel_ring_begin(rq, 6); 4540 if (IS_ERR(cs)) 4541 return PTR_ERR(cs); 4542 4543 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4544 4545 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4546 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4547 *cs++ = lower_32_bits(offset); 4548 *cs++ = upper_32_bits(offset); 4549 4550 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4551 *cs++ = MI_NOOP; 4552 4553 intel_ring_advance(rq, cs); 4554 4555 return 0; 4556 } 4557 4558 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4559 { 4560 ENGINE_WRITE(engine, RING_IMR, 4561 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4562 ENGINE_POSTING_READ(engine, RING_IMR); 4563 } 4564 4565 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4566 { 4567 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4568 } 4569 4570 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4571 { 4572 u32 cmd, *cs; 4573 4574 cs = intel_ring_begin(request, 4); 4575 if (IS_ERR(cs)) 4576 return PTR_ERR(cs); 4577 4578 cmd = MI_FLUSH_DW + 1; 4579 4580 /* We always require a command barrier so that subsequent 4581 * commands, such as breadcrumb interrupts, are strictly ordered 4582 * wrt the contents of the write cache being flushed to memory 4583 * (and thus being coherent from the CPU). 4584 */ 4585 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4586 4587 if (mode & EMIT_INVALIDATE) { 4588 cmd |= MI_INVALIDATE_TLB; 4589 if (request->engine->class == VIDEO_DECODE_CLASS) 4590 cmd |= MI_INVALIDATE_BSD; 4591 } 4592 4593 *cs++ = cmd; 4594 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4595 *cs++ = 0; /* upper addr */ 4596 *cs++ = 0; /* value */ 4597 intel_ring_advance(request, cs); 4598 4599 return 0; 4600 } 4601 4602 static int gen8_emit_flush_render(struct i915_request *request, 4603 u32 mode) 4604 { 4605 bool vf_flush_wa = false, dc_flush_wa = false; 4606 u32 *cs, flags = 0; 4607 int len; 4608 4609 flags |= PIPE_CONTROL_CS_STALL; 4610 4611 if (mode & EMIT_FLUSH) { 4612 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4613 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4614 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4615 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4616 } 4617 4618 if (mode & EMIT_INVALIDATE) { 4619 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4620 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4621 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4622 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4623 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4624 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4625 flags |= PIPE_CONTROL_QW_WRITE; 4626 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4627 4628 /* 4629 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4630 * pipe control. 
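		 *
		 * (A gen8_emit_pipe_control() is 6 dwords, which is where the
		 * "+6" for this preparatory null PIPE_CONTROL and the "+12"
		 * for the WaForGAMHang DC-flush bracket in the len accounting
		 * below come from.)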
4631 */ 4632 if (IS_GEN(request->engine->i915, 9)) 4633 vf_flush_wa = true; 4634 4635 /* WaForGAMHang:kbl */ 4636 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4637 dc_flush_wa = true; 4638 } 4639 4640 len = 6; 4641 4642 if (vf_flush_wa) 4643 len += 6; 4644 4645 if (dc_flush_wa) 4646 len += 12; 4647 4648 cs = intel_ring_begin(request, len); 4649 if (IS_ERR(cs)) 4650 return PTR_ERR(cs); 4651 4652 if (vf_flush_wa) 4653 cs = gen8_emit_pipe_control(cs, 0, 0); 4654 4655 if (dc_flush_wa) 4656 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4657 0); 4658 4659 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4660 4661 if (dc_flush_wa) 4662 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4663 4664 intel_ring_advance(request, cs); 4665 4666 return 0; 4667 } 4668 4669 static int gen11_emit_flush_render(struct i915_request *request, 4670 u32 mode) 4671 { 4672 if (mode & EMIT_FLUSH) { 4673 u32 *cs; 4674 u32 flags = 0; 4675 4676 flags |= PIPE_CONTROL_CS_STALL; 4677 4678 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4679 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4680 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4681 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4682 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4683 flags |= PIPE_CONTROL_QW_WRITE; 4684 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4685 4686 cs = intel_ring_begin(request, 6); 4687 if (IS_ERR(cs)) 4688 return PTR_ERR(cs); 4689 4690 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4691 intel_ring_advance(request, cs); 4692 } 4693 4694 if (mode & EMIT_INVALIDATE) { 4695 u32 *cs; 4696 u32 flags = 0; 4697 4698 flags |= PIPE_CONTROL_CS_STALL; 4699 4700 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4701 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4702 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4703 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4704 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4705 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4706 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4707 flags |= PIPE_CONTROL_QW_WRITE; 4708 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4709 4710 cs = intel_ring_begin(request, 6); 4711 if (IS_ERR(cs)) 4712 return PTR_ERR(cs); 4713 4714 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4715 intel_ring_advance(request, cs); 4716 } 4717 4718 return 0; 4719 } 4720 4721 static u32 preparser_disable(bool state) 4722 { 4723 return MI_ARB_CHECK | 1 << 8 | state; 4724 } 4725 4726 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4727 { 4728 static const i915_reg_t vd[] = { 4729 GEN12_VD0_AUX_NV, 4730 GEN12_VD1_AUX_NV, 4731 GEN12_VD2_AUX_NV, 4732 GEN12_VD3_AUX_NV, 4733 }; 4734 4735 static const i915_reg_t ve[] = { 4736 GEN12_VE0_AUX_NV, 4737 GEN12_VE1_AUX_NV, 4738 }; 4739 4740 if (engine->class == VIDEO_DECODE_CLASS) 4741 return vd[engine->instance]; 4742 4743 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4744 return ve[engine->instance]; 4745 4746 GEM_BUG_ON("unknown aux_inv_reg\n"); 4747 4748 return INVALID_MMIO_REG; 4749 } 4750 4751 static u32 * 4752 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4753 { 4754 *cs++ = MI_LOAD_REGISTER_IMM(1); 4755 *cs++ = i915_mmio_reg_offset(inv_reg); 4756 *cs++ = AUX_INV; 4757 *cs++ = MI_NOOP; 4758 4759 return cs; 4760 } 4761 4762 static int gen12_emit_flush_render(struct i915_request *request, 4763 u32 mode) 4764 { 4765 if (mode & EMIT_FLUSH) { 4766 u32 flags = 0; 4767 u32 *cs; 4768 4769 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4770 flags |= PIPE_CONTROL_FLUSH_L3; 4771 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4772 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4773 /* Wa_1409600907:tgl */ 4774 flags |= PIPE_CONTROL_DEPTH_STALL; 4775 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4776 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4777 4778 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4779 flags |= PIPE_CONTROL_QW_WRITE; 4780 4781 flags |= PIPE_CONTROL_CS_STALL; 4782 4783 cs = intel_ring_begin(request, 6); 4784 if (IS_ERR(cs)) 4785 return PTR_ERR(cs); 4786 4787 cs = gen12_emit_pipe_control(cs, 4788 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4789 flags, LRC_PPHWSP_SCRATCH_ADDR); 4790 intel_ring_advance(request, cs); 4791 } 4792 4793 if (mode & EMIT_INVALIDATE) { 4794 u32 flags = 0; 4795 u32 *cs; 4796 4797 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4798 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4799 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4800 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4801 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4802 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4803 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4804 4805 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4806 flags |= PIPE_CONTROL_QW_WRITE; 4807 4808 flags |= PIPE_CONTROL_CS_STALL; 4809 4810 cs = intel_ring_begin(request, 8 + 4); 4811 if (IS_ERR(cs)) 4812 return PTR_ERR(cs); 4813 4814 /* 4815 * Prevent the pre-parser from skipping past the TLB 4816 * invalidate and loading a stale page for the batch 4817 * buffer / request payload. 4818 */ 4819 *cs++ = preparser_disable(true); 4820 4821 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4822 4823 /* hsdes: 1809175790 */ 4824 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4825 4826 *cs++ = preparser_disable(false); 4827 intel_ring_advance(request, cs); 4828 } 4829 4830 return 0; 4831 } 4832 4833 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4834 { 4835 intel_engine_mask_t aux_inv = 0; 4836 u32 cmd, *cs; 4837 4838 cmd = 4; 4839 if (mode & EMIT_INVALIDATE) 4840 cmd += 2; 4841 if (mode & EMIT_INVALIDATE) 4842 aux_inv = request->engine->mask & ~BIT(BCS0); 4843 if (aux_inv) 4844 cmd += 2 * hweight8(aux_inv) + 2; 4845 4846 cs = intel_ring_begin(request, cmd); 4847 if (IS_ERR(cs)) 4848 return PTR_ERR(cs); 4849 4850 if (mode & EMIT_INVALIDATE) 4851 *cs++ = preparser_disable(true); 4852 4853 cmd = MI_FLUSH_DW + 1; 4854 4855 /* We always require a command barrier so that subsequent 4856 * commands, such as breadcrumb interrupts, are strictly ordered 4857 * wrt the contents of the write cache being flushed to memory 4858 * (and thus being coherent from the CPU). 
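	 *
	 * (Dword accounting: 'cmd' doubles as the ring_begin() length above -
	 * 4 dwords for the MI_FLUSH_DW itself, +2 for the pre-parser
	 * disable/enable pair on EMIT_INVALIDATE, and, per engine needing an
	 * aux-table invalidation, +2 for its register/value pair plus a
	 * shared +2 for the LRI header and trailing MI_NOOP.)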
4859 */ 4860 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4861 4862 if (mode & EMIT_INVALIDATE) { 4863 cmd |= MI_INVALIDATE_TLB; 4864 if (request->engine->class == VIDEO_DECODE_CLASS) 4865 cmd |= MI_INVALIDATE_BSD; 4866 } 4867 4868 *cs++ = cmd; 4869 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4870 *cs++ = 0; /* upper addr */ 4871 *cs++ = 0; /* value */ 4872 4873 if (aux_inv) { /* hsdes: 1809175790 */ 4874 struct intel_engine_cs *engine; 4875 unsigned int tmp; 4876 4877 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4878 for_each_engine_masked(engine, request->engine->gt, 4879 aux_inv, tmp) { 4880 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4881 *cs++ = AUX_INV; 4882 } 4883 *cs++ = MI_NOOP; 4884 } 4885 4886 if (mode & EMIT_INVALIDATE) 4887 *cs++ = preparser_disable(false); 4888 4889 intel_ring_advance(request, cs); 4890 4891 return 0; 4892 } 4893 4894 static void assert_request_valid(struct i915_request *rq) 4895 { 4896 struct intel_ring *ring __maybe_unused = rq->ring; 4897 4898 /* Can we unwind this request without appearing to go forwards? */ 4899 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4900 } 4901 4902 /* 4903 * Reserve space for 2 NOOPs at the end of each request to be 4904 * used as a workaround for not being allowed to do lite 4905 * restore with HEAD==TAIL (WaIdleLiteRestore). 4906 */ 4907 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4908 { 4909 /* Ensure there's always at least one preemption point per-request. */ 4910 *cs++ = MI_ARB_CHECK; 4911 *cs++ = MI_NOOP; 4912 request->wa_tail = intel_ring_offset(request, cs); 4913 4914 /* Check that entire request is less than half the ring */ 4915 assert_request_valid(request); 4916 4917 return cs; 4918 } 4919 4920 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4921 { 4922 *cs++ = MI_SEMAPHORE_WAIT | 4923 MI_SEMAPHORE_GLOBAL_GTT | 4924 MI_SEMAPHORE_POLL | 4925 MI_SEMAPHORE_SAD_EQ_SDD; 4926 *cs++ = 0; 4927 *cs++ = intel_hws_preempt_address(request->engine); 4928 *cs++ = 0; 4929 4930 return cs; 4931 } 4932 4933 static __always_inline u32* 4934 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4935 { 4936 *cs++ = MI_USER_INTERRUPT; 4937 4938 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4939 if (intel_engine_has_semaphores(request->engine)) 4940 cs = emit_preempt_busywait(request, cs); 4941 4942 request->tail = intel_ring_offset(request, cs); 4943 assert_ring_tail_valid(request->ring, request->tail); 4944 4945 return gen8_emit_wa_tail(request, cs); 4946 } 4947 4948 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) 4949 { 4950 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); 4951 } 4952 4953 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4954 { 4955 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4956 } 4957 4958 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4959 { 4960 cs = gen8_emit_pipe_control(cs, 4961 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4962 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4963 PIPE_CONTROL_DC_FLUSH_ENABLE, 4964 0); 4965 4966 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4967 cs = gen8_emit_ggtt_write_rcs(cs, 4968 request->fence.seqno, 4969 hwsp_offset(request), 4970 PIPE_CONTROL_FLUSH_ENABLE | 4971 PIPE_CONTROL_CS_STALL); 4972 4973 return gen8_emit_fini_breadcrumb_tail(request, cs); 4974 } 4975 4976 static u32 * 4977 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 
4978 { 4979 cs = gen8_emit_ggtt_write_rcs(cs, 4980 request->fence.seqno, 4981 hwsp_offset(request), 4982 PIPE_CONTROL_CS_STALL | 4983 PIPE_CONTROL_TILE_CACHE_FLUSH | 4984 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4985 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4986 PIPE_CONTROL_DC_FLUSH_ENABLE | 4987 PIPE_CONTROL_FLUSH_ENABLE); 4988 4989 return gen8_emit_fini_breadcrumb_tail(request, cs); 4990 } 4991 4992 /* 4993 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4994 * flush and will continue pre-fetching the instructions after it before the 4995 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4996 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4997 * of the next request before the memory has been flushed, we're guaranteed that 4998 * we won't access the batch itself too early. 4999 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 5000 * so, if the current request is modifying an instruction in the next request on 5001 * the same intel_context, we might pre-fetch and then execute the pre-update 5002 * instruction. To avoid this, the users of self-modifying code should either 5003 * disable the parser around the code emitting the memory writes, via a new flag 5004 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 5005 * the in-kernel use-cases we've opted to use a separate context, see 5006 * reloc_gpu() as an example. 5007 * All the above applies only to the instructions themselves. Non-inline data 5008 * used by the instructions is not pre-fetched. 5009 */ 5010 5011 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 5012 { 5013 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 5014 MI_SEMAPHORE_GLOBAL_GTT | 5015 MI_SEMAPHORE_POLL | 5016 MI_SEMAPHORE_SAD_EQ_SDD; 5017 *cs++ = 0; 5018 *cs++ = intel_hws_preempt_address(request->engine); 5019 *cs++ = 0; 5020 *cs++ = 0; 5021 *cs++ = MI_NOOP; 5022 5023 return cs; 5024 } 5025 5026 static __always_inline u32* 5027 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 5028 { 5029 *cs++ = MI_USER_INTERRUPT; 5030 5031 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 5032 if (intel_engine_has_semaphores(request->engine)) 5033 cs = gen12_emit_preempt_busywait(request, cs); 5034 5035 request->tail = intel_ring_offset(request, cs); 5036 assert_ring_tail_valid(request->ring, request->tail); 5037 5038 return gen8_emit_wa_tail(request, cs); 5039 } 5040 5041 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 5042 { 5043 /* XXX Stalling flush before seqno write; post-sync not */ 5044 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); 5045 return gen12_emit_fini_breadcrumb_tail(rq, cs); 5046 } 5047 5048 static u32 * 5049 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5050 { 5051 cs = gen12_emit_ggtt_write_rcs(cs, 5052 request->fence.seqno, 5053 hwsp_offset(request), 5054 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5055 PIPE_CONTROL_CS_STALL | 5056 PIPE_CONTROL_TILE_CACHE_FLUSH | 5057 PIPE_CONTROL_FLUSH_L3 | 5058 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5059 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5060 /* Wa_1409600907:tgl */ 5061 PIPE_CONTROL_DEPTH_STALL | 5062 PIPE_CONTROL_DC_FLUSH_ENABLE | 5063 PIPE_CONTROL_FLUSH_ENABLE); 5064 5065 return gen12_emit_fini_breadcrumb_tail(request, cs); 5066 } 5067 5068 static void execlists_park(struct intel_engine_cs *engine) 5069 { 5070 cancel_timer(&engine->execlists.timer); 5071 cancel_timer(&engine->execlists.preempt); 5072 } 5073 
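/*
 * Install the execlists backend as the engine's default submission method:
 * the submit_request/schedule hooks and the submission tasklet, the
 * prepare/rewind/cancel/finish reset callbacks and the park handler, plus
 * the capability flags (stats, semaphores, preemption, timeslicing,
 * relative MMIO) that depend on the gen and on whether a vGPU is active.
 */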
void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
{
	engine->submit_request = execlists_submit_request;
	engine->schedule = i915_schedule;
	engine->execlists.tasklet.func = execlists_submission_tasklet;

	engine->reset.prepare = execlists_reset_prepare;
	engine->reset.rewind = execlists_reset_rewind;
	engine->reset.cancel = execlists_reset_cancel;
	engine->reset.finish = execlists_reset_finish;

	engine->park = execlists_park;
	engine->unpark = NULL;

	engine->flags |= I915_ENGINE_SUPPORTS_STATS;
	if (!intel_vgpu_active(engine->i915)) {
		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
			if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
				engine->flags |= I915_ENGINE_HAS_TIMESLICES;
		}
	}

	if (INTEL_GEN(engine->i915) >= 12)
		engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;

	if (intel_engine_has_preemption(engine))
		engine->emit_bb_start = gen8_emit_bb_start;
	else
		engine->emit_bb_start = gen8_emit_bb_start_noarb;
}

static void execlists_shutdown(struct intel_engine_cs *engine)
{
	/* Synchronise with residual timers and any softirq they raise */
	del_timer_sync(&engine->execlists.timer);
	del_timer_sync(&engine->execlists.preempt);
	tasklet_kill(&engine->execlists.tasklet);
}

static void execlists_release(struct intel_engine_cs *engine)
{
	engine->sanitize = NULL; /* no longer in control, nothing to sanitize */

	execlists_shutdown(engine);

	intel_engine_cleanup_common(engine);
	lrc_destroy_wa_ctx(engine);
}

static void
logical_ring_default_vfuncs(struct intel_engine_cs *engine)
{
	/* Default vfuncs which can be overridden by each engine. */

	engine->resume = execlists_resume;

	engine->cops = &execlists_context_ops;
	engine->request_alloc = execlists_request_alloc;

	engine->emit_flush = gen8_emit_flush;
	engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
	engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
	if (INTEL_GEN(engine->i915) >= 12) {
		engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
		engine->emit_flush = gen12_emit_flush;
	}
	engine->set_default_submission = intel_execlists_set_default_submission;

	if (INTEL_GEN(engine->i915) < 11) {
		engine->irq_enable = gen8_logical_ring_enable_irq;
		engine->irq_disable = gen8_logical_ring_disable_irq;
	} else {
		/*
		 * TODO: On Gen11 interrupt masks need to be clear
		 * to allow C6 entry. Keep interrupts enabled and
		 * take the hit of generating extra interrupts
		 * until a more refined solution exists.
5153 */ 5154 } 5155 } 5156 5157 static inline void 5158 logical_ring_default_irqs(struct intel_engine_cs *engine) 5159 { 5160 unsigned int shift = 0; 5161 5162 if (INTEL_GEN(engine->i915) < 11) { 5163 const u8 irq_shifts[] = { 5164 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5165 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5166 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5167 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5168 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5169 }; 5170 5171 shift = irq_shifts[engine->id]; 5172 } 5173 5174 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5175 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5176 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5177 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5178 } 5179 5180 static void rcs_submission_override(struct intel_engine_cs *engine) 5181 { 5182 switch (INTEL_GEN(engine->i915)) { 5183 case 12: 5184 engine->emit_flush = gen12_emit_flush_render; 5185 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5186 break; 5187 case 11: 5188 engine->emit_flush = gen11_emit_flush_render; 5189 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5190 break; 5191 default: 5192 engine->emit_flush = gen8_emit_flush_render; 5193 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5194 break; 5195 } 5196 } 5197 5198 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5199 { 5200 struct intel_engine_execlists * const execlists = &engine->execlists; 5201 struct drm_i915_private *i915 = engine->i915; 5202 struct intel_uncore *uncore = engine->uncore; 5203 u32 base = engine->mmio_base; 5204 5205 tasklet_init(&engine->execlists.tasklet, 5206 execlists_submission_tasklet, (unsigned long)engine); 5207 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5208 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5209 5210 logical_ring_default_vfuncs(engine); 5211 logical_ring_default_irqs(engine); 5212 5213 if (engine->class == RENDER_CLASS) 5214 rcs_submission_override(engine); 5215 5216 if (intel_init_workaround_bb(engine)) 5217 /* 5218 * We continue even if we fail to initialize WA batch 5219 * because we only expect rare glitches but nothing 5220 * critical to prevent us from using GPU 5221 */ 5222 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5223 5224 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5225 execlists->submit_reg = uncore->regs + 5226 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5227 execlists->ctrl_reg = uncore->regs + 5228 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5229 } else { 5230 execlists->submit_reg = uncore->regs + 5231 i915_mmio_reg_offset(RING_ELSP(base)); 5232 } 5233 5234 execlists->csb_status = 5235 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5236 5237 execlists->csb_write = 5238 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5239 5240 if (INTEL_GEN(i915) < 11) 5241 execlists->csb_size = GEN8_CSB_ENTRIES; 5242 else 5243 execlists->csb_size = GEN11_CSB_ENTRIES; 5244 5245 if (INTEL_GEN(engine->i915) >= 11) { 5246 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5247 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5248 } 5249 5250 /* Finally, take ownership and responsibility for cleanup! 
*/ 5251 engine->sanitize = execlists_sanitize; 5252 engine->release = execlists_release; 5253 5254 return 0; 5255 } 5256 5257 static void init_common_reg_state(u32 * const regs, 5258 const struct intel_engine_cs *engine, 5259 const struct intel_ring *ring, 5260 bool inhibit) 5261 { 5262 u32 ctl; 5263 5264 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5265 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5266 if (inhibit) 5267 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5268 if (INTEL_GEN(engine->i915) < 11) 5269 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5270 CTX_CTRL_RS_CTX_ENABLE); 5271 regs[CTX_CONTEXT_CONTROL] = ctl; 5272 5273 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5274 regs[CTX_TIMESTAMP] = 0; 5275 } 5276 5277 static void init_wa_bb_reg_state(u32 * const regs, 5278 const struct intel_engine_cs *engine) 5279 { 5280 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5281 5282 if (wa_ctx->per_ctx.size) { 5283 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5284 5285 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5286 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5287 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5288 } 5289 5290 if (wa_ctx->indirect_ctx.size) { 5291 lrc_ring_setup_indirect_ctx(regs, engine, 5292 i915_ggtt_offset(wa_ctx->vma) + 5293 wa_ctx->indirect_ctx.offset, 5294 wa_ctx->indirect_ctx.size); 5295 } 5296 } 5297 5298 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5299 { 5300 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5301 /* 64b PPGTT (48bit canonical) 5302 * PDP0_DESCRIPTOR contains the base address to PML4 and 5303 * other PDP Descriptors are ignored. 5304 */ 5305 ASSIGN_CTX_PML4(ppgtt, regs); 5306 } else { 5307 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5308 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5309 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5310 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5311 } 5312 } 5313 5314 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5315 { 5316 if (i915_is_ggtt(vm)) 5317 return i915_vm_to_ggtt(vm)->alias; 5318 else 5319 return i915_vm_to_ppgtt(vm); 5320 } 5321 5322 static void execlists_init_reg_state(u32 *regs, 5323 const struct intel_context *ce, 5324 const struct intel_engine_cs *engine, 5325 const struct intel_ring *ring, 5326 bool inhibit) 5327 { 5328 /* 5329 * A context is actually a big batch buffer with several 5330 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5331 * values we are setting here are only for the first context restore: 5332 * on a subsequent save, the GPU will recreate this batchbuffer with new 5333 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5334 * we are not initializing here). 5335 * 5336 * Must keep consistent with virtual_update_register_offsets(). 
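	 *
	 * As a rough sketch, the state image that set_offsets() lays out is:
	 *
	 *   MI_LOAD_REGISTER_IMM(N)
	 *       (reg offset, value) x N
	 *   MI_LOAD_REGISTER_IMM(M)
	 *       (reg offset, value) x M
	 *   ...
	 *
	 * where the register offsets are fixed per engine and only the values
	 * differ between contexts.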
5337 */ 5338 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5339 5340 init_common_reg_state(regs, engine, ring, inhibit); 5341 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5342 5343 init_wa_bb_reg_state(regs, engine); 5344 5345 __reset_stop_ring(regs, engine); 5346 } 5347 5348 static int 5349 populate_lr_context(struct intel_context *ce, 5350 struct drm_i915_gem_object *ctx_obj, 5351 struct intel_engine_cs *engine, 5352 struct intel_ring *ring) 5353 { 5354 bool inhibit = true; 5355 void *vaddr; 5356 5357 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5358 if (IS_ERR(vaddr)) { 5359 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5360 return PTR_ERR(vaddr); 5361 } 5362 5363 set_redzone(vaddr, engine); 5364 5365 if (engine->default_state) { 5366 shmem_read(engine->default_state, 0, 5367 vaddr, engine->context_size); 5368 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5369 inhibit = false; 5370 } 5371 5372 /* Clear the ppHWSP (inc. per-context counters) */ 5373 memset(vaddr, 0, PAGE_SIZE); 5374 5375 /* 5376 * The second page of the context object contains some registers which 5377 * must be set up prior to the first execution. 5378 */ 5379 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5380 ce, engine, ring, inhibit); 5381 5382 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5383 i915_gem_object_unpin_map(ctx_obj); 5384 return 0; 5385 } 5386 5387 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5388 { 5389 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5390 5391 return intel_timeline_create_from_engine(ce->engine, 5392 page_unmask_bits(tl)); 5393 } 5394 5395 static int __execlists_context_alloc(struct intel_context *ce, 5396 struct intel_engine_cs *engine) 5397 { 5398 struct drm_i915_gem_object *ctx_obj; 5399 struct intel_ring *ring; 5400 struct i915_vma *vma; 5401 u32 context_size; 5402 int ret; 5403 5404 GEM_BUG_ON(ce->state); 5405 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5406 5407 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5408 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5409 5410 if (INTEL_GEN(engine->i915) == 12) { 5411 ce->wa_bb_page = context_size / PAGE_SIZE; 5412 context_size += PAGE_SIZE; 5413 } 5414 5415 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5416 if (IS_ERR(ctx_obj)) 5417 return PTR_ERR(ctx_obj); 5418 5419 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5420 if (IS_ERR(vma)) { 5421 ret = PTR_ERR(vma); 5422 goto error_deref_obj; 5423 } 5424 5425 if (!page_mask_bits(ce->timeline)) { 5426 struct intel_timeline *tl; 5427 5428 /* 5429 * Use the static global HWSP for the kernel context, and 5430 * a dynamically allocated cacheline for everyone else. 
5431 */ 5432 if (unlikely(ce->timeline)) 5433 tl = pinned_timeline(ce); 5434 else 5435 tl = intel_timeline_create(engine->gt); 5436 if (IS_ERR(tl)) { 5437 ret = PTR_ERR(tl); 5438 goto error_deref_obj; 5439 } 5440 5441 ce->timeline = tl; 5442 } 5443 5444 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5445 if (IS_ERR(ring)) { 5446 ret = PTR_ERR(ring); 5447 goto error_deref_obj; 5448 } 5449 5450 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5451 if (ret) { 5452 drm_dbg(&engine->i915->drm, 5453 "Failed to populate LRC: %d\n", ret); 5454 goto error_ring_free; 5455 } 5456 5457 ce->ring = ring; 5458 ce->state = vma; 5459 5460 return 0; 5461 5462 error_ring_free: 5463 intel_ring_put(ring); 5464 error_deref_obj: 5465 i915_gem_object_put(ctx_obj); 5466 return ret; 5467 } 5468 5469 static struct list_head *virtual_queue(struct virtual_engine *ve) 5470 { 5471 return &ve->base.execlists.default_priolist.requests[0]; 5472 } 5473 5474 static void rcu_virtual_context_destroy(struct work_struct *wrk) 5475 { 5476 struct virtual_engine *ve = 5477 container_of(wrk, typeof(*ve), rcu.work); 5478 unsigned int n; 5479 5480 GEM_BUG_ON(ve->context.inflight); 5481 5482 /* Preempt-to-busy may leave a stale request behind. */ 5483 if (unlikely(ve->request)) { 5484 struct i915_request *old; 5485 5486 spin_lock_irq(&ve->base.active.lock); 5487 5488 old = fetch_and_zero(&ve->request); 5489 if (old) { 5490 GEM_BUG_ON(!i915_request_completed(old)); 5491 __i915_request_submit(old); 5492 i915_request_put(old); 5493 } 5494 5495 spin_unlock_irq(&ve->base.active.lock); 5496 } 5497 5498 /* 5499 * Flush the tasklet in case it is still running on another core. 5500 * 5501 * This needs to be done before we remove ourselves from the siblings' 5502 * rbtrees as in the case it is running in parallel, it may reinsert 5503 * the rb_node into a sibling. 5504 */ 5505 tasklet_kill(&ve->base.execlists.tasklet); 5506 5507 /* Decouple ourselves from the siblings, no more access allowed. */ 5508 for (n = 0; n < ve->num_siblings; n++) { 5509 struct intel_engine_cs *sibling = ve->siblings[n]; 5510 struct rb_node *node = &ve->nodes[sibling->id].rb; 5511 5512 if (RB_EMPTY_NODE(node)) 5513 continue; 5514 5515 spin_lock_irq(&sibling->active.lock); 5516 5517 /* Detachment is lazily performed in the execlists tasklet */ 5518 if (!RB_EMPTY_NODE(node)) 5519 rb_erase_cached(node, &sibling->execlists.virtual); 5520 5521 spin_unlock_irq(&sibling->active.lock); 5522 } 5523 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5524 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5525 5526 if (ve->context.state) 5527 __execlists_context_fini(&ve->context); 5528 intel_context_fini(&ve->context); 5529 5530 intel_breadcrumbs_free(ve->base.breadcrumbs); 5531 intel_engine_free_request_pool(&ve->base); 5532 5533 kfree(ve->bonds); 5534 kfree(ve); 5535 } 5536 5537 static void virtual_context_destroy(struct kref *kref) 5538 { 5539 struct virtual_engine *ve = 5540 container_of(kref, typeof(*ve), context.ref); 5541 5542 GEM_BUG_ON(!list_empty(&ve->context.signals)); 5543 5544 /* 5545 * When destroying the virtual engine, we have to be aware that 5546 * it may still be in use from an hardirq/softirq context causing 5547 * the resubmission of a completed request (background completion 5548 * due to preempt-to-busy). Before we can free the engine, we need 5549 * to flush the submission code and tasklets that are still potentially 5550 * accessing the engine. 
Flushing the tasklets requires process context, 5551 * and since we can guard the resubmit onto the engine with an RCU read 5552 * lock, we can delegate the free of the engine to an RCU worker. 5553 */ 5554 INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy); 5555 queue_rcu_work(system_wq, &ve->rcu); 5556 } 5557 5558 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5559 { 5560 int swp; 5561 5562 /* 5563 * Pick a random sibling on starting to help spread the load around. 5564 * 5565 * New contexts are typically created with exactly the same order 5566 * of siblings, and often started in batches. Due to the way we iterate 5567 * the array of siblings when submitting requests, sibling[0] is 5568 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5569 * randomised across the system, we also help spread the load by the 5570 * first engine we inspect being different each time. 5571 * 5572 * NB This does not force us to execute on this engine; it will just 5573 * typically be the first we inspect for submission. 5574 */ 5575 swp = prandom_u32_max(ve->num_siblings); 5576 if (swp) 5577 swap(ve->siblings[swp], ve->siblings[0]); 5578 } 5579 5580 static int virtual_context_alloc(struct intel_context *ce) 5581 { 5582 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5583 5584 return __execlists_context_alloc(ce, ve->siblings[0]); 5585 } 5586 5587 static int virtual_context_pin(struct intel_context *ce, void *vaddr) 5588 { 5589 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5590 5591 /* Note: we must use a real engine class for setting up reg state */ 5592 return __execlists_context_pin(ce, ve->siblings[0], vaddr); 5593 } 5594 5595 static void virtual_context_enter(struct intel_context *ce) 5596 { 5597 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5598 unsigned int n; 5599 5600 for (n = 0; n < ve->num_siblings; n++) 5601 intel_engine_pm_get(ve->siblings[n]); 5602 5603 intel_timeline_enter(ce->timeline); 5604 } 5605 5606 static void virtual_context_exit(struct intel_context *ce) 5607 { 5608 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5609 unsigned int n; 5610 5611 intel_timeline_exit(ce->timeline); 5612 5613 for (n = 0; n < ve->num_siblings; n++) 5614 intel_engine_pm_put(ve->siblings[n]); 5615 } 5616 5617 static const struct intel_context_ops virtual_context_ops = { 5618 .alloc = virtual_context_alloc, 5619 5620 .pre_pin = execlists_context_pre_pin, 5621 .pin = virtual_context_pin, 5622 .unpin = execlists_context_unpin, 5623 .post_unpin = execlists_context_post_unpin, 5624 5625 .enter = virtual_context_enter, 5626 .exit = virtual_context_exit, 5627 5628 .destroy = virtual_context_destroy, 5629 }; 5630 5631 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5632 { 5633 struct i915_request *rq; 5634 intel_engine_mask_t mask; 5635 5636 rq = READ_ONCE(ve->request); 5637 if (!rq) 5638 return 0; 5639 5640 /* The rq is ready for submission; rq->execution_mask is now stable.
*/ 5641 mask = rq->execution_mask; 5642 if (unlikely(!mask)) { 5643 /* Invalid selection, submit to a random engine in error */ 5644 i915_request_set_error_once(rq, -ENODEV); 5645 mask = ve->siblings[0]->mask; 5646 } 5647 5648 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5649 rq->fence.context, rq->fence.seqno, 5650 mask, ve->base.execlists.queue_priority_hint); 5651 5652 return mask; 5653 } 5654 5655 static void virtual_submission_tasklet(unsigned long data) 5656 { 5657 struct virtual_engine * const ve = (struct virtual_engine *)data; 5658 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5659 intel_engine_mask_t mask; 5660 unsigned int n; 5661 5662 rcu_read_lock(); 5663 mask = virtual_submission_mask(ve); 5664 rcu_read_unlock(); 5665 if (unlikely(!mask)) 5666 return; 5667 5668 local_irq_disable(); 5669 for (n = 0; n < ve->num_siblings; n++) { 5670 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5671 struct ve_node * const node = &ve->nodes[sibling->id]; 5672 struct rb_node **parent, *rb; 5673 bool first; 5674 5675 if (!READ_ONCE(ve->request)) 5676 break; /* already handled by a sibling's tasklet */ 5677 5678 if (unlikely(!(mask & sibling->mask))) { 5679 if (!RB_EMPTY_NODE(&node->rb)) { 5680 spin_lock(&sibling->active.lock); 5681 rb_erase_cached(&node->rb, 5682 &sibling->execlists.virtual); 5683 RB_CLEAR_NODE(&node->rb); 5684 spin_unlock(&sibling->active.lock); 5685 } 5686 continue; 5687 } 5688 5689 spin_lock(&sibling->active.lock); 5690 5691 if (!RB_EMPTY_NODE(&node->rb)) { 5692 /* 5693 * Cheat and avoid rebalancing the tree if we can 5694 * reuse this node in situ. 5695 */ 5696 first = rb_first_cached(&sibling->execlists.virtual) == 5697 &node->rb; 5698 if (prio == node->prio || (prio > node->prio && first)) 5699 goto submit_engine; 5700 5701 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5702 } 5703 5704 rb = NULL; 5705 first = true; 5706 parent = &sibling->execlists.virtual.rb_root.rb_node; 5707 while (*parent) { 5708 struct ve_node *other; 5709 5710 rb = *parent; 5711 other = rb_entry(rb, typeof(*other), rb); 5712 if (prio > other->prio) { 5713 parent = &rb->rb_left; 5714 } else { 5715 parent = &rb->rb_right; 5716 first = false; 5717 } 5718 } 5719 5720 rb_link_node(&node->rb, rb, parent); 5721 rb_insert_color_cached(&node->rb, 5722 &sibling->execlists.virtual, 5723 first); 5724 5725 submit_engine: 5726 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5727 node->prio = prio; 5728 if (first && prio > sibling->execlists.queue_priority_hint) 5729 tasklet_hi_schedule(&sibling->execlists.tasklet); 5730 5731 spin_unlock(&sibling->active.lock); 5732 } 5733 local_irq_enable(); 5734 } 5735 5736 static void virtual_submit_request(struct i915_request *rq) 5737 { 5738 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5739 struct i915_request *old; 5740 unsigned long flags; 5741 5742 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5743 rq->fence.context, 5744 rq->fence.seqno); 5745 5746 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5747 5748 spin_lock_irqsave(&ve->base.active.lock, flags); 5749 5750 old = ve->request; 5751 if (old) { /* background completion event from preempt-to-busy */ 5752 GEM_BUG_ON(!i915_request_completed(old)); 5753 __i915_request_submit(old); 5754 i915_request_put(old); 5755 } 5756 5757 if (i915_request_completed(rq)) { 5758 __i915_request_submit(rq); 5759 5760 ve->base.execlists.queue_priority_hint = INT_MIN; 5761 ve->request = NULL; 5762 } else { 5763 ve->base.execlists.queue_priority_hint = rq_prio(rq); 
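/*
 * The virtual engine tracks at most one pending request at a time
 * (ve->request): take a reference, park it on the virtual queue and
 * kick our tasklet, which will offer the request to each sibling.
 */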
5764 ve->request = i915_request_get(rq); 5765 5766 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5767 list_move_tail(&rq->sched.link, virtual_queue(ve)); 5768 5769 tasklet_hi_schedule(&ve->base.execlists.tasklet); 5770 } 5771 5772 spin_unlock_irqrestore(&ve->base.active.lock, flags); 5773 } 5774 5775 static struct ve_bond * 5776 virtual_find_bond(struct virtual_engine *ve, 5777 const struct intel_engine_cs *master) 5778 { 5779 int i; 5780 5781 for (i = 0; i < ve->num_bonds; i++) { 5782 if (ve->bonds[i].master == master) 5783 return &ve->bonds[i]; 5784 } 5785 5786 return NULL; 5787 } 5788 5789 static void 5790 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 5791 { 5792 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5793 intel_engine_mask_t allowed, exec; 5794 struct ve_bond *bond; 5795 5796 allowed = ~to_request(signal)->engine->mask; 5797 5798 bond = virtual_find_bond(ve, to_request(signal)->engine); 5799 if (bond) 5800 allowed &= bond->sibling_mask; 5801 5802 /* Restrict the bonded request to run on only the available engines */ 5803 exec = READ_ONCE(rq->execution_mask); 5804 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 5805 ; 5806 5807 /* Prevent the master from being re-run on the bonded engines */ 5808 to_request(signal)->execution_mask &= ~allowed; 5809 } 5810 5811 struct intel_context * 5812 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 5813 unsigned int count) 5814 { 5815 struct virtual_engine *ve; 5816 unsigned int n; 5817 int err; 5818 5819 if (count == 0) 5820 return ERR_PTR(-EINVAL); 5821 5822 if (count == 1) 5823 return intel_context_create(siblings[0]); 5824 5825 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); 5826 if (!ve) 5827 return ERR_PTR(-ENOMEM); 5828 5829 ve->base.i915 = siblings[0]->i915; 5830 ve->base.gt = siblings[0]->gt; 5831 ve->base.uncore = siblings[0]->uncore; 5832 ve->base.id = -1; 5833 5834 ve->base.class = OTHER_CLASS; 5835 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; 5836 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5837 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; 5838 5839 /* 5840 * The decision on whether to submit a request using semaphores 5841 * depends on the saturated state of the engine. We only compute 5842 * this during HW submission of the request, and we need for this 5843 * state to be globally applied to all requests being submitted 5844 * to this engine. Virtual engines encompass more than one physical 5845 * engine and so we cannot accurately tell in advance if one of those 5846 * engines is already saturated and so cannot afford to use a semaphore 5847 * and be pessimized in priority for doing so -- if we are the only 5848 * context using semaphores after all other clients have stopped, we 5849 * will be starved on the saturated system. Such a global switch for 5850 * semaphores is less than ideal, but alas is the current compromise. 
5851 */ 5852 ve->base.saturated = ALL_ENGINES; 5853 5854 snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); 5855 5856 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); 5857 intel_engine_init_execlists(&ve->base); 5858 5859 ve->base.cops = &virtual_context_ops; 5860 ve->base.request_alloc = execlists_request_alloc; 5861 5862 ve->base.schedule = i915_schedule; 5863 ve->base.submit_request = virtual_submit_request; 5864 ve->base.bond_execute = virtual_bond_execute; 5865 5866 INIT_LIST_HEAD(virtual_queue(ve)); 5867 ve->base.execlists.queue_priority_hint = INT_MIN; 5868 tasklet_init(&ve->base.execlists.tasklet, 5869 virtual_submission_tasklet, 5870 (unsigned long)ve); 5871 5872 intel_context_init(&ve->context, &ve->base); 5873 5874 ve->base.breadcrumbs = intel_breadcrumbs_create(NULL); 5875 if (!ve->base.breadcrumbs) { 5876 err = -ENOMEM; 5877 goto err_put; 5878 } 5879 5880 for (n = 0; n < count; n++) { 5881 struct intel_engine_cs *sibling = siblings[n]; 5882 5883 GEM_BUG_ON(!is_power_of_2(sibling->mask)); 5884 if (sibling->mask & ve->base.mask) { 5885 DRM_DEBUG("duplicate %s entry in load balancer\n", 5886 sibling->name); 5887 err = -EINVAL; 5888 goto err_put; 5889 } 5890 5891 /* 5892 * The virtual engine implementation is tightly coupled to 5893 * the execlists backend -- we push our requests directly 5894 * into a tree inside each physical engine. We could support 5895 * layering if we handle cloning of the requests and 5896 * submitting a copy into each backend. 5897 */ 5898 if (sibling->execlists.tasklet.func != 5899 execlists_submission_tasklet) { 5900 err = -ENODEV; 5901 goto err_put; 5902 } 5903 5904 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); 5905 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); 5906 5907 ve->siblings[ve->num_siblings++] = sibling; 5908 ve->base.mask |= sibling->mask; 5909 5910 /* 5911 * All physical engines must be compatible for their emission 5912 * functions (as we build the instructions during request 5913 * construction and do not alter them before submission 5914 * on the physical engine). We use the engine class as a guide 5915 * here, although that could be refined.
5916 */ 5917 if (ve->base.class != OTHER_CLASS) { 5918 if (ve->base.class != sibling->class) { 5919 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", 5920 sibling->class, ve->base.class); 5921 err = -EINVAL; 5922 goto err_put; 5923 } 5924 continue; 5925 } 5926 5927 ve->base.class = sibling->class; 5928 ve->base.uabi_class = sibling->uabi_class; 5929 snprintf(ve->base.name, sizeof(ve->base.name), 5930 "v%dx%d", ve->base.class, count); 5931 ve->base.context_size = sibling->context_size; 5932 5933 ve->base.emit_bb_start = sibling->emit_bb_start; 5934 ve->base.emit_flush = sibling->emit_flush; 5935 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; 5936 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; 5937 ve->base.emit_fini_breadcrumb_dw = 5938 sibling->emit_fini_breadcrumb_dw; 5939 5940 ve->base.flags = sibling->flags; 5941 } 5942 5943 ve->base.flags |= I915_ENGINE_IS_VIRTUAL; 5944 5945 virtual_engine_initial_hint(ve); 5946 return &ve->context; 5947 5948 err_put: 5949 intel_context_put(&ve->context); 5950 return ERR_PTR(err); 5951 } 5952 5953 struct intel_context * 5954 intel_execlists_clone_virtual(struct intel_engine_cs *src) 5955 { 5956 struct virtual_engine *se = to_virtual_engine(src); 5957 struct intel_context *dst; 5958 5959 dst = intel_execlists_create_virtual(se->siblings, 5960 se->num_siblings); 5961 if (IS_ERR(dst)) 5962 return dst; 5963 5964 if (se->num_bonds) { 5965 struct virtual_engine *de = to_virtual_engine(dst->engine); 5966 5967 de->bonds = kmemdup(se->bonds, 5968 sizeof(*se->bonds) * se->num_bonds, 5969 GFP_KERNEL); 5970 if (!de->bonds) { 5971 intel_context_put(dst); 5972 return ERR_PTR(-ENOMEM); 5973 } 5974 5975 de->num_bonds = se->num_bonds; 5976 } 5977 5978 return dst; 5979 } 5980 5981 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, 5982 const struct intel_engine_cs *master, 5983 const struct intel_engine_cs *sibling) 5984 { 5985 struct virtual_engine *ve = to_virtual_engine(engine); 5986 struct ve_bond *bond; 5987 int n; 5988 5989 /* Sanity check the sibling is part of the virtual engine */ 5990 for (n = 0; n < ve->num_siblings; n++) 5991 if (sibling == ve->siblings[n]) 5992 break; 5993 if (n == ve->num_siblings) 5994 return -EINVAL; 5995 5996 bond = virtual_find_bond(ve, master); 5997 if (bond) { 5998 bond->sibling_mask |= sibling->mask; 5999 return 0; 6000 } 6001 6002 bond = krealloc(ve->bonds, 6003 sizeof(*bond) * (ve->num_bonds + 1), 6004 GFP_KERNEL); 6005 if (!bond) 6006 return -ENOMEM; 6007 6008 bond[ve->num_bonds].master = master; 6009 bond[ve->num_bonds].sibling_mask = sibling->mask; 6010 6011 ve->bonds = bond; 6012 ve->num_bonds++; 6013 6014 return 0; 6015 } 6016 6017 void intel_execlists_show_requests(struct intel_engine_cs *engine, 6018 struct drm_printer *m, 6019 void (*show_request)(struct drm_printer *m, 6020 const struct i915_request *rq, 6021 const char *prefix, 6022 int indent), 6023 unsigned int max) 6024 { 6025 const struct intel_engine_execlists *execlists = &engine->execlists; 6026 struct i915_request *rq, *last; 6027 unsigned long flags; 6028 unsigned int count; 6029 struct rb_node *rb; 6030 6031 spin_lock_irqsave(&engine->active.lock, flags); 6032 6033 last = NULL; 6034 count = 0; 6035 list_for_each_entry(rq, &engine->active.requests, sched.link) { 6036 if (count++ < max - 1) 6037 show_request(m, rq, "\t\t", 0); 6038 else 6039 last = rq; 6040 } 6041 if (last) { 6042 if (count > max) { 6043 drm_printf(m, 6044 "\t\t...skipping %d executing requests...\n", 6045 count - max); 6046 } 
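/* Always print the final request so the end of the list stays visible even when eliding. */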
6047 show_request(m, last, "\t\t", 0); 6048 } 6049 6050 if (execlists->switch_priority_hint != INT_MIN) 6051 drm_printf(m, "\t\tSwitch priority hint: %d\n", 6052 READ_ONCE(execlists->switch_priority_hint)); 6053 if (execlists->queue_priority_hint != INT_MIN) 6054 drm_printf(m, "\t\tQueue priority hint: %d\n", 6055 READ_ONCE(execlists->queue_priority_hint)); 6056 6057 last = NULL; 6058 count = 0; 6059 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { 6060 struct i915_priolist *p = rb_entry(rb, typeof(*p), node); 6061 int i; 6062 6063 priolist_for_each_request(rq, p, i) { 6064 if (count++ < max - 1) 6065 show_request(m, rq, "\t\t", 0); 6066 else 6067 last = rq; 6068 } 6069 } 6070 if (last) { 6071 if (count > max) { 6072 drm_printf(m, 6073 "\t\t...skipping %d queued requests...\n", 6074 count - max); 6075 } 6076 show_request(m, last, "\t\t", 0); 6077 } 6078 6079 last = NULL; 6080 count = 0; 6081 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { 6082 struct virtual_engine *ve = 6083 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 6084 struct i915_request *rq = READ_ONCE(ve->request); 6085 6086 if (rq) { 6087 if (count++ < max - 1) 6088 show_request(m, rq, "\t\t", 0); 6089 else 6090 last = rq; 6091 } 6092 } 6093 if (last) { 6094 if (count > max) { 6095 drm_printf(m, 6096 "\t\t...skipping %d virtual requests...\n", 6097 count - max); 6098 } 6099 show_request(m, last, "\t\t", 0); 6100 } 6101 6102 spin_unlock_irqrestore(&engine->active.lock, flags); 6103 } 6104 6105 void intel_lr_context_reset(struct intel_engine_cs *engine, 6106 struct intel_context *ce, 6107 u32 head, 6108 bool scrub) 6109 { 6110 GEM_BUG_ON(!intel_context_is_pinned(ce)); 6111 6112 /* 6113 * We want a simple context + ring to execute the breadcrumb update. 6114 * We cannot rely on the context being intact across the GPU hang, 6115 * so clear it and rebuild just what we need for the breadcrumb. 6116 * All pending requests for this context will be zapped, and any 6117 * future request will be after userspace has had the opportunity 6118 * to recreate its own state. 6119 */ 6120 if (scrub) 6121 restore_default_state(ce, engine); 6122 6123 /* Rerun the request; its payload has been neutered (if guilty). */ 6124 __execlists_update_reg_state(ce, engine, head); 6125 } 6126 6127 bool 6128 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine) 6129 { 6130 return engine->set_default_submission == 6131 intel_execlists_set_default_submission; 6132 } 6133 6134 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 6135 #include "selftest_lrc.c" 6136 #endif 6137
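/*
 * Illustrative sketch only (not part of the driver): how a caller might
 * build a two-way load-balancing virtual engine from the helpers above
 * and attach a bond. 'vcs0', 'vcs1' and 'master' are placeholder engine
 * pointers (assumed to be two engines of the same class plus a separate
 * master engine); error handling is abbreviated.
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *	int err;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 *	// Bond: once the signaling request runs on 'master', restrict
 *	// the bonded request on this virtual engine to vcs1.
 *	err = intel_virtual_engine_attach_bond(ce->engine, master, vcs1);
 *	if (err) {
 *		intel_context_put(ce);
 *		return err;
 *	}
 *
 *	// ... submit requests against ce, then drop the reference ...
 *	intel_context_put(ce);
 */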