/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things in the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need one set of those per engine command streamer? This
 * is where the name "Logical Rings" starts to make sense: by virtualizing
 * the rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
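 *
 * As an illustrative mapping of the description above onto the driver
 * structures used later in this file (the authoritative layout is, of
 * course, struct intel_context itself)::
 *
 *	intel_context (one instance per context, per engine)
 *	    ce->state -- per-engine backing object holding the register state
 *	    ce->ring  -- per-engine logical ring buffer commands are written to
 *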
 * The local default context for each opened fd is more complex, because we
 * don't know at creation time which engine is going to use it. To handle
 * this, we have implemented a deferred creation of LR contexts:
 *
 * The local context starts its life as a hollow or blank holder that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based,
 * method). This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch
 * interrupt. During the interrupt handling, the driver examines the events in
 * the buffer: for each context complete event, if the announced ID matches
 * that on the head of the request queue, then that request is retired and
 * removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice
 * in an execution list, if subsequent requests have the same ID as the first
 * then the two requests must be combined. This is done simply by discarding
 * requests at the head of the queue until either only one request is left (in
 * which case we use a NULL second context) or the first two requests have
 * unique IDs.
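 *
 * Expressed as a rough sketch (illustrative only -- the real dequeue logic
 * lives in execlists_dequeue() below and also has to cope with preemption,
 * timeslicing and virtual engines)::
 *
 *	elsp[0] = pop head of queue;
 *	while (queue non-empty && head shares elsp[0]'s context)
 *		elsp[0] = pop head of queue;	(one combined RING_TAIL update)
 *	elsp[1] = pop head of queue, or NULL;	(always a different context)
 *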
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single
 * context completes but a second context is still executing, the request for
 * this second context will be at the head of the queue when we remove the
 * first one. This request will then be resubmitted along with a new request
 * for a different context, which will cause the hardware to continue
 * executing the second request and queue the new request (the GPU detects
 * the condition of a context getting preempted with the same context and
 * optimizes the context switch flow by not doing preemption, but just
 * sampling the new tail pointer).
 *
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
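	 *
	 * For example (a hypothetical configuration, purely for illustration):
	 * a bond of { .master = vcs0, .sibling_mask = vcs1->mask } means that
	 * a request carrying a submit-fence from vcs0 will only ever be placed
	 * on vcs1, regardless of how busy the other siblings are.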
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 244 { 245 if (INTEL_GEN(engine->i915) >= 12) 246 return 0x60; 247 else if (INTEL_GEN(engine->i915) >= 9) 248 return 0x54; 249 else if (engine->class == RENDER_CLASS) 250 return 0x58; 251 else 252 return -1; 253 } 254 255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 256 { 257 if (INTEL_GEN(engine->i915) >= 12) 258 return 0x74; 259 else if (INTEL_GEN(engine->i915) >= 9) 260 return 0x68; 261 else if (engine->class == RENDER_CLASS) 262 return 0xd8; 263 else 264 return -1; 265 } 266 267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 268 { 269 if (INTEL_GEN(engine->i915) >= 12) 270 return 0x12; 271 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 272 return 0x18; 273 else 274 return -1; 275 } 276 277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 278 { 279 int x; 280 281 x = lrc_ring_wa_bb_per_ctx(engine); 282 if (x < 0) 283 return x; 284 285 return x + 2; 286 } 287 288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 289 { 290 int x; 291 292 x = lrc_ring_indirect_ptr(engine); 293 if (x < 0) 294 return x; 295 296 return x + 2; 297 } 298 299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 300 { 301 if (engine->class != RENDER_CLASS) 302 return -1; 303 304 if (INTEL_GEN(engine->i915) >= 12) 305 return 0xb6; 306 else if (INTEL_GEN(engine->i915) >= 11) 307 return 0xaa; 308 else 309 return -1; 310 } 311 312 static u32 313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 314 { 315 switch (INTEL_GEN(engine->i915)) { 316 default: 317 MISSING_CASE(INTEL_GEN(engine->i915)); 318 fallthrough; 319 case 12: 320 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 321 case 11: 322 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 323 case 10: 324 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 325 case 9: 326 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 327 case 8: 328 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 329 } 330 } 331 332 static void 333 lrc_ring_setup_indirect_ctx(u32 *regs, 334 const struct intel_engine_cs *engine, 335 u32 ctx_bb_ggtt_addr, 336 u32 size) 337 { 338 GEM_BUG_ON(!size); 339 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 340 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 341 regs[lrc_ring_indirect_ptr(engine) + 1] = 342 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 343 344 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 345 regs[lrc_ring_indirect_offset(engine) + 1] = 346 
lrc_ring_indirect_offset_default(engine) << 6; 347 } 348 349 static u32 intel_context_get_runtime(const struct intel_context *ce) 350 { 351 /* 352 * We can use either ppHWSP[16] which is recorded before the context 353 * switch (and so excludes the cost of context switches) or use the 354 * value from the context image itself, which is saved/restored earlier 355 * and so includes the cost of the save. 356 */ 357 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 358 } 359 360 static void mark_eio(struct i915_request *rq) 361 { 362 if (i915_request_completed(rq)) 363 return; 364 365 GEM_BUG_ON(i915_request_signaled(rq)); 366 367 i915_request_set_error_once(rq, -EIO); 368 i915_request_mark_complete(rq); 369 } 370 371 static struct i915_request * 372 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 373 { 374 struct i915_request *active = rq; 375 376 rcu_read_lock(); 377 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 378 if (i915_request_completed(rq)) 379 break; 380 381 active = rq; 382 } 383 rcu_read_unlock(); 384 385 return active; 386 } 387 388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 389 { 390 return (i915_ggtt_offset(engine->status_page.vma) + 391 I915_GEM_HWS_PREEMPT_ADDR); 392 } 393 394 static inline void 395 ring_set_paused(const struct intel_engine_cs *engine, int state) 396 { 397 /* 398 * We inspect HWS_PREEMPT with a semaphore inside 399 * engine->emit_fini_breadcrumb. If the dword is true, 400 * the ring is paused as the semaphore will busywait 401 * until the dword is false. 402 */ 403 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 404 if (state) 405 wmb(); 406 } 407 408 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 409 { 410 return rb_entry(rb, struct i915_priolist, node); 411 } 412 413 static inline int rq_prio(const struct i915_request *rq) 414 { 415 return READ_ONCE(rq->sched.attr.priority); 416 } 417 418 static int effective_prio(const struct i915_request *rq) 419 { 420 int prio = rq_prio(rq); 421 422 /* 423 * If this request is special and must not be interrupted at any 424 * cost, so be it. Note we are only checking the most recent request 425 * in the context and so may be masking an earlier vip request. It 426 * is hoped that under the conditions where nopreempt is used, this 427 * will not matter (i.e. all requests to that context will be 428 * nopreempt for as long as desired). 429 */ 430 if (i915_request_has_nopreempt(rq)) 431 prio = I915_PRIORITY_UNPREEMPTABLE; 432 433 return prio; 434 } 435 436 static int queue_prio(const struct intel_engine_execlists *execlists) 437 { 438 struct i915_priolist *p; 439 struct rb_node *rb; 440 441 rb = rb_first_cached(&execlists->queue); 442 if (!rb) 443 return INT_MIN; 444 445 /* 446 * As the priolist[] are inverted, with the highest priority in [0], 447 * we have to flip the index value to become priority. 448 */ 449 p = to_priolist(rb); 450 if (!I915_USER_PRIORITY_SHIFT) 451 return p->priority; 452 453 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 454 } 455 456 static inline bool need_preempt(const struct intel_engine_cs *engine, 457 const struct i915_request *rq, 458 struct rb_node *rb) 459 { 460 int last_prio; 461 462 if (!intel_engine_has_semaphores(engine)) 463 return false; 464 465 /* 466 * Check if the current priority hint merits a preemption attempt. 
467 * 468 * We record the highest value priority we saw during rescheduling 469 * prior to this dequeue, therefore we know that if it is strictly 470 * less than the current tail of ESLP[0], we do not need to force 471 * a preempt-to-idle cycle. 472 * 473 * However, the priority hint is a mere hint that we may need to 474 * preempt. If that hint is stale or we may be trying to preempt 475 * ourselves, ignore the request. 476 * 477 * More naturally we would write 478 * prio >= max(0, last); 479 * except that we wish to prevent triggering preemption at the same 480 * priority level: the task that is running should remain running 481 * to preserve FIFO ordering of dependencies. 482 */ 483 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 484 if (engine->execlists.queue_priority_hint <= last_prio) 485 return false; 486 487 /* 488 * Check against the first request in ELSP[1], it will, thanks to the 489 * power of PI, be the highest priority of that context. 490 */ 491 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 492 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 493 return true; 494 495 if (rb) { 496 struct virtual_engine *ve = 497 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 498 bool preempt = false; 499 500 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 501 struct i915_request *next; 502 503 rcu_read_lock(); 504 next = READ_ONCE(ve->request); 505 if (next) 506 preempt = rq_prio(next) > last_prio; 507 rcu_read_unlock(); 508 } 509 510 if (preempt) 511 return preempt; 512 } 513 514 /* 515 * If the inflight context did not trigger the preemption, then maybe 516 * it was the set of queued requests? Pick the highest priority in 517 * the queue (the first active priolist) and see if it deserves to be 518 * running instead of ELSP[0]. 519 * 520 * The highest priority request in the queue can not be either 521 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 522 * context, it's priority would not exceed ELSP[0] aka last_prio. 523 */ 524 return queue_prio(&engine->execlists) > last_prio; 525 } 526 527 __maybe_unused static inline bool 528 assert_priority_queue(const struct i915_request *prev, 529 const struct i915_request *next) 530 { 531 /* 532 * Without preemption, the prev may refer to the still active element 533 * which we refuse to let go. 534 * 535 * Even with preemption, there are times when we think it is better not 536 * to preempt and leave an ostensibly lower priority request in flight. 537 */ 538 if (i915_request_is_active(prev)) 539 return true; 540 541 return rq_prio(prev) >= rq_prio(next); 542 } 543 544 /* 545 * The context descriptor encodes various attributes of a context, 546 * including its GTT address and some flags. Because it's fairly 547 * expensive to calculate, we'll just do it once and cache the result, 548 * which remains valid until the context is unpinned. 
549 * 550 * This is what a descriptor looks like, from LSB to MSB:: 551 * 552 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 553 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 554 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 555 * bits 53-54: mbz, reserved for use by hardware 556 * bits 55-63: group ID, currently unused and set to 0 557 * 558 * Starting from Gen11, the upper dword of the descriptor has a new format: 559 * 560 * bits 32-36: reserved 561 * bits 37-47: SW context ID 562 * bits 48:53: engine instance 563 * bit 54: mbz, reserved for use by hardware 564 * bits 55-60: SW counter 565 * bits 61-63: engine class 566 * 567 * engine info, SW context ID and SW counter need to form a unique number 568 * (Context ID) per lrc. 569 */ 570 static u32 571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 572 { 573 u32 desc; 574 575 desc = INTEL_LEGACY_32B_CONTEXT; 576 if (i915_vm_is_4lvl(ce->vm)) 577 desc = INTEL_LEGACY_64B_CONTEXT; 578 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 579 580 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 581 if (IS_GEN(engine->i915, 8)) 582 desc |= GEN8_CTX_L3LLC_COHERENT; 583 584 return i915_ggtt_offset(ce->state) | desc; 585 } 586 587 static inline unsigned int dword_in_page(void *addr) 588 { 589 return offset_in_page(addr) / sizeof(u32); 590 } 591 592 static void set_offsets(u32 *regs, 593 const u8 *data, 594 const struct intel_engine_cs *engine, 595 bool clear) 596 #define NOP(x) (BIT(7) | (x)) 597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 598 #define POSTED BIT(0) 599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 600 #define REG16(x) \ 601 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 602 (((x) >> 2) & 0x7f) 603 #define END(total_state_size) 0, (total_state_size) 604 { 605 const u32 base = engine->mmio_base; 606 607 while (*data) { 608 u8 count, flags; 609 610 if (*data & BIT(7)) { /* skip */ 611 count = *data++ & ~BIT(7); 612 if (clear) 613 memset32(regs, MI_NOOP, count); 614 regs += count; 615 continue; 616 } 617 618 count = *data & 0x3f; 619 flags = *data >> 6; 620 data++; 621 622 *regs = MI_LOAD_REGISTER_IMM(count); 623 if (flags & POSTED) 624 *regs |= MI_LRI_FORCE_POSTED; 625 if (INTEL_GEN(engine->i915) >= 11) 626 *regs |= MI_LRI_LRM_CS_MMIO; 627 regs++; 628 629 GEM_BUG_ON(!count); 630 do { 631 u32 offset = 0; 632 u8 v; 633 634 do { 635 v = *data++; 636 offset <<= 7; 637 offset |= v & ~BIT(7); 638 } while (v & BIT(7)); 639 640 regs[0] = base + (offset << 2); 641 if (clear) 642 regs[1] = 0; 643 regs += 2; 644 } while (--count); 645 } 646 647 if (clear) { 648 u8 count = *++data; 649 650 /* Clear past the tail for HW access */ 651 GEM_BUG_ON(dword_in_page(regs) > count); 652 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 653 654 /* Close the batch; used mainly by live_lrc_layout() */ 655 *regs = MI_BATCH_BUFFER_END; 656 if (INTEL_GEN(engine->i915) >= 10) 657 *regs |= BIT(0); 658 } 659 } 660 661 static const u8 gen8_xcs_offsets[] = { 662 NOP(1), 663 LRI(11, 0), 664 REG16(0x244), 665 REG(0x034), 666 REG(0x030), 667 REG(0x038), 668 REG(0x03c), 669 REG(0x168), 670 REG(0x140), 671 REG(0x110), 672 REG(0x11c), 673 REG(0x114), 674 REG(0x118), 675 676 NOP(9), 677 LRI(9, 0), 678 REG16(0x3a8), 679 REG16(0x28c), 680 REG16(0x288), 681 REG16(0x284), 682 REG16(0x280), 683 REG16(0x27c), 684 REG16(0x278), 685 REG16(0x274), 686 REG16(0x270), 687 688 NOP(13), 689 LRI(2, 0), 690 REG16(0x200), 691 REG(0x028), 692 693 END(80) 
694 }; 695 696 static const u8 gen9_xcs_offsets[] = { 697 NOP(1), 698 LRI(14, POSTED), 699 REG16(0x244), 700 REG(0x034), 701 REG(0x030), 702 REG(0x038), 703 REG(0x03c), 704 REG(0x168), 705 REG(0x140), 706 REG(0x110), 707 REG(0x11c), 708 REG(0x114), 709 REG(0x118), 710 REG(0x1c0), 711 REG(0x1c4), 712 REG(0x1c8), 713 714 NOP(3), 715 LRI(9, POSTED), 716 REG16(0x3a8), 717 REG16(0x28c), 718 REG16(0x288), 719 REG16(0x284), 720 REG16(0x280), 721 REG16(0x27c), 722 REG16(0x278), 723 REG16(0x274), 724 REG16(0x270), 725 726 NOP(13), 727 LRI(1, POSTED), 728 REG16(0x200), 729 730 NOP(13), 731 LRI(44, POSTED), 732 REG(0x028), 733 REG(0x09c), 734 REG(0x0c0), 735 REG(0x178), 736 REG(0x17c), 737 REG16(0x358), 738 REG(0x170), 739 REG(0x150), 740 REG(0x154), 741 REG(0x158), 742 REG16(0x41c), 743 REG16(0x600), 744 REG16(0x604), 745 REG16(0x608), 746 REG16(0x60c), 747 REG16(0x610), 748 REG16(0x614), 749 REG16(0x618), 750 REG16(0x61c), 751 REG16(0x620), 752 REG16(0x624), 753 REG16(0x628), 754 REG16(0x62c), 755 REG16(0x630), 756 REG16(0x634), 757 REG16(0x638), 758 REG16(0x63c), 759 REG16(0x640), 760 REG16(0x644), 761 REG16(0x648), 762 REG16(0x64c), 763 REG16(0x650), 764 REG16(0x654), 765 REG16(0x658), 766 REG16(0x65c), 767 REG16(0x660), 768 REG16(0x664), 769 REG16(0x668), 770 REG16(0x66c), 771 REG16(0x670), 772 REG16(0x674), 773 REG16(0x678), 774 REG16(0x67c), 775 REG(0x068), 776 777 END(176) 778 }; 779 780 static const u8 gen12_xcs_offsets[] = { 781 NOP(1), 782 LRI(13, POSTED), 783 REG16(0x244), 784 REG(0x034), 785 REG(0x030), 786 REG(0x038), 787 REG(0x03c), 788 REG(0x168), 789 REG(0x140), 790 REG(0x110), 791 REG(0x1c0), 792 REG(0x1c4), 793 REG(0x1c8), 794 REG(0x180), 795 REG16(0x2b4), 796 797 NOP(5), 798 LRI(9, POSTED), 799 REG16(0x3a8), 800 REG16(0x28c), 801 REG16(0x288), 802 REG16(0x284), 803 REG16(0x280), 804 REG16(0x27c), 805 REG16(0x278), 806 REG16(0x274), 807 REG16(0x270), 808 809 END(80) 810 }; 811 812 static const u8 gen8_rcs_offsets[] = { 813 NOP(1), 814 LRI(14, POSTED), 815 REG16(0x244), 816 REG(0x034), 817 REG(0x030), 818 REG(0x038), 819 REG(0x03c), 820 REG(0x168), 821 REG(0x140), 822 REG(0x110), 823 REG(0x11c), 824 REG(0x114), 825 REG(0x118), 826 REG(0x1c0), 827 REG(0x1c4), 828 REG(0x1c8), 829 830 NOP(3), 831 LRI(9, POSTED), 832 REG16(0x3a8), 833 REG16(0x28c), 834 REG16(0x288), 835 REG16(0x284), 836 REG16(0x280), 837 REG16(0x27c), 838 REG16(0x278), 839 REG16(0x274), 840 REG16(0x270), 841 842 NOP(13), 843 LRI(1, 0), 844 REG(0x0c8), 845 846 END(80) 847 }; 848 849 static const u8 gen9_rcs_offsets[] = { 850 NOP(1), 851 LRI(14, POSTED), 852 REG16(0x244), 853 REG(0x34), 854 REG(0x30), 855 REG(0x38), 856 REG(0x3c), 857 REG(0x168), 858 REG(0x140), 859 REG(0x110), 860 REG(0x11c), 861 REG(0x114), 862 REG(0x118), 863 REG(0x1c0), 864 REG(0x1c4), 865 REG(0x1c8), 866 867 NOP(3), 868 LRI(9, POSTED), 869 REG16(0x3a8), 870 REG16(0x28c), 871 REG16(0x288), 872 REG16(0x284), 873 REG16(0x280), 874 REG16(0x27c), 875 REG16(0x278), 876 REG16(0x274), 877 REG16(0x270), 878 879 NOP(13), 880 LRI(1, 0), 881 REG(0xc8), 882 883 NOP(13), 884 LRI(44, POSTED), 885 REG(0x28), 886 REG(0x9c), 887 REG(0xc0), 888 REG(0x178), 889 REG(0x17c), 890 REG16(0x358), 891 REG(0x170), 892 REG(0x150), 893 REG(0x154), 894 REG(0x158), 895 REG16(0x41c), 896 REG16(0x600), 897 REG16(0x604), 898 REG16(0x608), 899 REG16(0x60c), 900 REG16(0x610), 901 REG16(0x614), 902 REG16(0x618), 903 REG16(0x61c), 904 REG16(0x620), 905 REG16(0x624), 906 REG16(0x628), 907 REG16(0x62c), 908 REG16(0x630), 909 REG16(0x634), 910 REG16(0x638), 911 REG16(0x63c), 912 
REG16(0x640), 913 REG16(0x644), 914 REG16(0x648), 915 REG16(0x64c), 916 REG16(0x650), 917 REG16(0x654), 918 REG16(0x658), 919 REG16(0x65c), 920 REG16(0x660), 921 REG16(0x664), 922 REG16(0x668), 923 REG16(0x66c), 924 REG16(0x670), 925 REG16(0x674), 926 REG16(0x678), 927 REG16(0x67c), 928 REG(0x68), 929 930 END(176) 931 }; 932 933 static const u8 gen11_rcs_offsets[] = { 934 NOP(1), 935 LRI(15, POSTED), 936 REG16(0x244), 937 REG(0x034), 938 REG(0x030), 939 REG(0x038), 940 REG(0x03c), 941 REG(0x168), 942 REG(0x140), 943 REG(0x110), 944 REG(0x11c), 945 REG(0x114), 946 REG(0x118), 947 REG(0x1c0), 948 REG(0x1c4), 949 REG(0x1c8), 950 REG(0x180), 951 952 NOP(1), 953 LRI(9, POSTED), 954 REG16(0x3a8), 955 REG16(0x28c), 956 REG16(0x288), 957 REG16(0x284), 958 REG16(0x280), 959 REG16(0x27c), 960 REG16(0x278), 961 REG16(0x274), 962 REG16(0x270), 963 964 LRI(1, POSTED), 965 REG(0x1b0), 966 967 NOP(10), 968 LRI(1, 0), 969 REG(0x0c8), 970 971 END(80) 972 }; 973 974 static const u8 gen12_rcs_offsets[] = { 975 NOP(1), 976 LRI(13, POSTED), 977 REG16(0x244), 978 REG(0x034), 979 REG(0x030), 980 REG(0x038), 981 REG(0x03c), 982 REG(0x168), 983 REG(0x140), 984 REG(0x110), 985 REG(0x1c0), 986 REG(0x1c4), 987 REG(0x1c8), 988 REG(0x180), 989 REG16(0x2b4), 990 991 NOP(5), 992 LRI(9, POSTED), 993 REG16(0x3a8), 994 REG16(0x28c), 995 REG16(0x288), 996 REG16(0x284), 997 REG16(0x280), 998 REG16(0x27c), 999 REG16(0x278), 1000 REG16(0x274), 1001 REG16(0x270), 1002 1003 LRI(3, POSTED), 1004 REG(0x1b0), 1005 REG16(0x5a8), 1006 REG16(0x5ac), 1007 1008 NOP(6), 1009 LRI(1, 0), 1010 REG(0x0c8), 1011 NOP(3 + 9 + 1), 1012 1013 LRI(51, POSTED), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG(0x028), 1021 REG(0x09c), 1022 REG(0x0c0), 1023 REG(0x178), 1024 REG(0x17c), 1025 REG16(0x358), 1026 REG(0x170), 1027 REG(0x150), 1028 REG(0x154), 1029 REG(0x158), 1030 REG16(0x41c), 1031 REG16(0x600), 1032 REG16(0x604), 1033 REG16(0x608), 1034 REG16(0x60c), 1035 REG16(0x610), 1036 REG16(0x614), 1037 REG16(0x618), 1038 REG16(0x61c), 1039 REG16(0x620), 1040 REG16(0x624), 1041 REG16(0x628), 1042 REG16(0x62c), 1043 REG16(0x630), 1044 REG16(0x634), 1045 REG16(0x638), 1046 REG16(0x63c), 1047 REG16(0x640), 1048 REG16(0x644), 1049 REG16(0x648), 1050 REG16(0x64c), 1051 REG16(0x650), 1052 REG16(0x654), 1053 REG16(0x658), 1054 REG16(0x65c), 1055 REG16(0x660), 1056 REG16(0x664), 1057 REG16(0x668), 1058 REG16(0x66c), 1059 REG16(0x670), 1060 REG16(0x674), 1061 REG16(0x678), 1062 REG16(0x67c), 1063 REG(0x068), 1064 REG(0x084), 1065 NOP(1), 1066 1067 END(192) 1068 }; 1069 1070 #undef END 1071 #undef REG16 1072 #undef REG 1073 #undef LRI 1074 #undef NOP 1075 1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1077 { 1078 /* 1079 * The gen12+ lists only have the registers we program in the basic 1080 * default state. We rely on the context image using relative 1081 * addressing to automatic fixup the register state between the 1082 * physical engines for virtual engine. 
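	 *
	 * (Informal note: these tables are consumed by set_offsets() above;
	 * each REG()/REG16() entry becomes an MI_LOAD_REGISTER_IMM payload of
	 * engine->mmio_base plus the encoded offset, and on Gen11+ the LRI
	 * header additionally carries MI_LRI_LRM_CS_MMIO, which is the
	 * relative-addressing mode relied upon above.)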
1083 */ 1084 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1085 !intel_engine_has_relative_mmio(engine)); 1086 1087 if (engine->class == RENDER_CLASS) { 1088 if (INTEL_GEN(engine->i915) >= 12) 1089 return gen12_rcs_offsets; 1090 else if (INTEL_GEN(engine->i915) >= 11) 1091 return gen11_rcs_offsets; 1092 else if (INTEL_GEN(engine->i915) >= 9) 1093 return gen9_rcs_offsets; 1094 else 1095 return gen8_rcs_offsets; 1096 } else { 1097 if (INTEL_GEN(engine->i915) >= 12) 1098 return gen12_xcs_offsets; 1099 else if (INTEL_GEN(engine->i915) >= 9) 1100 return gen9_xcs_offsets; 1101 else 1102 return gen8_xcs_offsets; 1103 } 1104 } 1105 1106 static struct i915_request * 1107 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1108 { 1109 struct i915_request *rq, *rn, *active = NULL; 1110 struct list_head *pl; 1111 int prio = I915_PRIORITY_INVALID; 1112 1113 lockdep_assert_held(&engine->active.lock); 1114 1115 list_for_each_entry_safe_reverse(rq, rn, 1116 &engine->active.requests, 1117 sched.link) { 1118 if (i915_request_completed(rq)) 1119 continue; /* XXX */ 1120 1121 __i915_request_unsubmit(rq); 1122 1123 /* 1124 * Push the request back into the queue for later resubmission. 1125 * If this request is not native to this physical engine (i.e. 1126 * it came from a virtual source), push it back onto the virtual 1127 * engine so that it can be moved across onto another physical 1128 * engine as load dictates. 1129 */ 1130 if (likely(rq->execution_mask == engine->mask)) { 1131 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1132 if (rq_prio(rq) != prio) { 1133 prio = rq_prio(rq); 1134 pl = i915_sched_lookup_priolist(engine, prio); 1135 } 1136 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1137 1138 list_move(&rq->sched.link, pl); 1139 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1140 1141 /* Check in case we rollback so far we wrap [size/2] */ 1142 if (intel_ring_direction(rq->ring, 1143 rq->tail, 1144 rq->ring->tail + 8) > 0) 1145 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1146 1147 active = rq; 1148 } else { 1149 struct intel_engine_cs *owner = rq->context->engine; 1150 1151 WRITE_ONCE(rq->engine, owner); 1152 owner->submit_request(rq); 1153 active = NULL; 1154 } 1155 } 1156 1157 return active; 1158 } 1159 1160 struct i915_request * 1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1162 { 1163 struct intel_engine_cs *engine = 1164 container_of(execlists, typeof(*engine), execlists); 1165 1166 return __unwind_incomplete_requests(engine); 1167 } 1168 1169 static inline void 1170 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1171 { 1172 /* 1173 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1174 * The compiler should eliminate this function as dead-code. 
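	 *
	 * (The chain is notified with INTEL_CONTEXT_SCHEDULE_IN/OUT from
	 * __execlists_schedule_in()/__execlists_schedule_out() below, so that
	 * the GVT-g mediator can track when a context enters and leaves the
	 * hardware.)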
1175 */ 1176 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1177 return; 1178 1179 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1180 status, rq); 1181 } 1182 1183 static void intel_engine_context_in(struct intel_engine_cs *engine) 1184 { 1185 unsigned long flags; 1186 1187 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1188 return; 1189 1190 write_seqlock_irqsave(&engine->stats.lock, flags); 1191 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1192 engine->stats.start = ktime_get(); 1193 atomic_inc(&engine->stats.active); 1194 } 1195 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1196 } 1197 1198 static void intel_engine_context_out(struct intel_engine_cs *engine) 1199 { 1200 unsigned long flags; 1201 1202 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1203 1204 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1205 return; 1206 1207 write_seqlock_irqsave(&engine->stats.lock, flags); 1208 if (atomic_dec_and_test(&engine->stats.active)) { 1209 engine->stats.total = 1210 ktime_add(engine->stats.total, 1211 ktime_sub(ktime_get(), engine->stats.start)); 1212 } 1213 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1214 } 1215 1216 static void 1217 execlists_check_context(const struct intel_context *ce, 1218 const struct intel_engine_cs *engine) 1219 { 1220 const struct intel_ring *ring = ce->ring; 1221 u32 *regs = ce->lrc_reg_state; 1222 bool valid = true; 1223 int x; 1224 1225 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1226 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1227 engine->name, 1228 regs[CTX_RING_START], 1229 i915_ggtt_offset(ring->vma)); 1230 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1231 valid = false; 1232 } 1233 1234 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1235 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1236 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1237 engine->name, 1238 regs[CTX_RING_CTL], 1239 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1240 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1241 valid = false; 1242 } 1243 1244 x = lrc_ring_mi_mode(engine); 1245 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1246 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1247 engine->name, regs[x + 1]); 1248 regs[x + 1] &= ~STOP_RING; 1249 regs[x + 1] |= STOP_RING << 16; 1250 valid = false; 1251 } 1252 1253 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1254 } 1255 1256 static void restore_default_state(struct intel_context *ce, 1257 struct intel_engine_cs *engine) 1258 { 1259 u32 *regs; 1260 1261 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1262 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1263 1264 ce->runtime.last = intel_context_get_runtime(ce); 1265 } 1266 1267 static void reset_active(struct i915_request *rq, 1268 struct intel_engine_cs *engine) 1269 { 1270 struct intel_context * const ce = rq->context; 1271 u32 head; 1272 1273 /* 1274 * The executing context has been cancelled. We want to prevent 1275 * further execution along this context and propagate the error on 1276 * to anything depending on its results. 1277 * 1278 * In __i915_request_submit(), we apply the -EIO and remove the 1279 * requests' payloads for any banned requests. But first, we must 1280 * rewind the context back to the start of the incomplete request so 1281 * that we do not jump back into the middle of the batch. 
1282 * 1283 * We preserve the breadcrumbs and semaphores of the incomplete 1284 * requests so that inter-timeline dependencies (i.e other timelines) 1285 * remain correctly ordered. And we defer to __i915_request_submit() 1286 * so that all asynchronous waits are correctly handled. 1287 */ 1288 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1289 rq->fence.context, rq->fence.seqno); 1290 1291 /* On resubmission of the active request, payload will be scrubbed */ 1292 if (i915_request_completed(rq)) 1293 head = rq->tail; 1294 else 1295 head = active_request(ce->timeline, rq)->head; 1296 head = intel_ring_wrap(ce->ring, head); 1297 1298 /* Scrub the context image to prevent replaying the previous batch */ 1299 restore_default_state(ce, engine); 1300 __execlists_update_reg_state(ce, engine, head); 1301 1302 /* We've switched away, so this should be a no-op, but intent matters */ 1303 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1304 } 1305 1306 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1307 { 1308 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1309 ce->runtime.num_underflow += dt < 0; 1310 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1311 #endif 1312 } 1313 1314 static void intel_context_update_runtime(struct intel_context *ce) 1315 { 1316 u32 old; 1317 s32 dt; 1318 1319 if (intel_context_is_barrier(ce)) 1320 return; 1321 1322 old = ce->runtime.last; 1323 ce->runtime.last = intel_context_get_runtime(ce); 1324 dt = ce->runtime.last - old; 1325 1326 if (unlikely(dt <= 0)) { 1327 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1328 old, ce->runtime.last, dt); 1329 st_update_runtime_underflow(ce, dt); 1330 return; 1331 } 1332 1333 ewma_runtime_add(&ce->runtime.avg, dt); 1334 ce->runtime.total += dt; 1335 } 1336 1337 static inline struct intel_engine_cs * 1338 __execlists_schedule_in(struct i915_request *rq) 1339 { 1340 struct intel_engine_cs * const engine = rq->engine; 1341 struct intel_context * const ce = rq->context; 1342 1343 intel_context_get(ce); 1344 1345 if (unlikely(intel_context_is_banned(ce))) 1346 reset_active(rq, engine); 1347 1348 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1349 execlists_check_context(ce, engine); 1350 1351 if (ce->tag) { 1352 /* Use a fixed tag for OA and friends */ 1353 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1354 ce->lrc.ccid = ce->tag; 1355 } else { 1356 /* We don't need a strict matching tag, just different values */ 1357 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1358 1359 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1360 clear_bit(tag - 1, &engine->context_tag); 1361 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1362 1363 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1364 } 1365 1366 ce->lrc.ccid |= engine->execlists.ccid; 1367 1368 __intel_gt_pm_get(engine->gt); 1369 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1370 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1371 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1372 intel_engine_context_in(engine); 1373 1374 return engine; 1375 } 1376 1377 static inline struct i915_request * 1378 execlists_schedule_in(struct i915_request *rq, int idx) 1379 { 1380 struct intel_context * const ce = rq->context; 1381 struct intel_engine_cs *old; 1382 1383 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1384 trace_i915_request_in(rq, idx); 1385 1386 old = READ_ONCE(ce->inflight); 1387 do { 1388 if (!old) { 1389 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1390 break; 1391 } 1392 
} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1393 1394 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1395 return i915_request_get(rq); 1396 } 1397 1398 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1399 { 1400 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1401 struct i915_request *next = READ_ONCE(ve->request); 1402 1403 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1404 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1405 } 1406 1407 static inline void 1408 __execlists_schedule_out(struct i915_request *rq, 1409 struct intel_engine_cs * const engine, 1410 unsigned int ccid) 1411 { 1412 struct intel_context * const ce = rq->context; 1413 1414 /* 1415 * NB process_csb() is not under the engine->active.lock and hence 1416 * schedule_out can race with schedule_in meaning that we should 1417 * refrain from doing non-trivial work here. 1418 */ 1419 1420 /* 1421 * If we have just completed this context, the engine may now be 1422 * idle and we want to re-enter powersaving. 1423 */ 1424 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1425 i915_request_completed(rq)) 1426 intel_engine_add_retire(engine, ce->timeline); 1427 1428 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1429 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1430 if (ccid < BITS_PER_LONG) { 1431 GEM_BUG_ON(ccid == 0); 1432 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1433 set_bit(ccid - 1, &engine->context_tag); 1434 } 1435 1436 intel_context_update_runtime(ce); 1437 intel_engine_context_out(engine); 1438 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1439 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1440 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1441 intel_gt_pm_put_async(engine->gt); 1442 1443 /* 1444 * If this is part of a virtual engine, its next request may 1445 * have been blocked waiting for access to the active context. 1446 * We have to kick all the siblings again in case we need to 1447 * switch (e.g. the next request is not runnable on this 1448 * engine). Hopefully, we will already have submitted the next 1449 * request before the tasklet runs and do not need to rebuild 1450 * each virtual tree and kick everyone again. 1451 */ 1452 if (ce->engine != engine) 1453 kick_siblings(rq, ce); 1454 1455 intel_context_put(ce); 1456 } 1457 1458 static inline void 1459 execlists_schedule_out(struct i915_request *rq) 1460 { 1461 struct intel_context * const ce = rq->context; 1462 struct intel_engine_cs *cur, *old; 1463 u32 ccid; 1464 1465 trace_i915_request_out(rq); 1466 1467 ccid = rq->context->lrc.ccid; 1468 old = READ_ONCE(ce->inflight); 1469 do 1470 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1471 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1472 if (!cur) 1473 __execlists_schedule_out(rq, old, ccid); 1474 1475 i915_request_put(rq); 1476 } 1477 1478 static u64 execlists_update_context(struct i915_request *rq) 1479 { 1480 struct intel_context *ce = rq->context; 1481 u64 desc = ce->lrc.desc; 1482 u32 tail, prev; 1483 1484 /* 1485 * WaIdleLiteRestore:bdw,skl 1486 * 1487 * We should never submit the context with the same RING_TAIL twice 1488 * just in case we submit an empty ring, which confuses the HW. 1489 * 1490 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1491 * the normal request to be able to always advance the RING_TAIL on 1492 * subsequent resubmissions (for lite restore). 
Should that fail us, 1493 * and we try and submit the same tail again, force the context 1494 * reload. 1495 * 1496 * If we need to return to a preempted context, we need to skip the 1497 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1498 * HW has a tendency to ignore us rewinding the TAIL to the end of 1499 * an earlier request. 1500 */ 1501 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1502 prev = rq->ring->tail; 1503 tail = intel_ring_set_tail(rq->ring, rq->tail); 1504 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1505 desc |= CTX_DESC_FORCE_RESTORE; 1506 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1507 rq->tail = rq->wa_tail; 1508 1509 /* 1510 * Make sure the context image is complete before we submit it to HW. 1511 * 1512 * Ostensibly, writes (including the WCB) should be flushed prior to 1513 * an uncached write such as our mmio register access, the empirical 1514 * evidence (esp. on Braswell) suggests that the WC write into memory 1515 * may not be visible to the HW prior to the completion of the UC 1516 * register write and that we may begin execution from the context 1517 * before its image is complete leading to invalid PD chasing. 1518 */ 1519 wmb(); 1520 1521 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1522 return desc; 1523 } 1524 1525 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1526 { 1527 if (execlists->ctrl_reg) { 1528 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1529 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1530 } else { 1531 writel(upper_32_bits(desc), execlists->submit_reg); 1532 writel(lower_32_bits(desc), execlists->submit_reg); 1533 } 1534 } 1535 1536 static __maybe_unused char * 1537 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1538 { 1539 if (!rq) 1540 return ""; 1541 1542 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1543 prefix, 1544 rq->context->lrc.ccid, 1545 rq->fence.context, rq->fence.seqno, 1546 i915_request_completed(rq) ? "!" : 1547 i915_request_started(rq) ? 
"*" : 1548 "", 1549 rq_prio(rq)); 1550 1551 return buf; 1552 } 1553 1554 static __maybe_unused void 1555 trace_ports(const struct intel_engine_execlists *execlists, 1556 const char *msg, 1557 struct i915_request * const *ports) 1558 { 1559 const struct intel_engine_cs *engine = 1560 container_of(execlists, typeof(*engine), execlists); 1561 char __maybe_unused p0[40], p1[40]; 1562 1563 if (!ports[0]) 1564 return; 1565 1566 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1567 dump_port(p0, sizeof(p0), "", ports[0]), 1568 dump_port(p1, sizeof(p1), ", ", ports[1])); 1569 } 1570 1571 static inline bool 1572 reset_in_progress(const struct intel_engine_execlists *execlists) 1573 { 1574 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1575 } 1576 1577 static __maybe_unused bool 1578 assert_pending_valid(const struct intel_engine_execlists *execlists, 1579 const char *msg) 1580 { 1581 struct intel_engine_cs *engine = 1582 container_of(execlists, typeof(*engine), execlists); 1583 struct i915_request * const *port, *rq; 1584 struct intel_context *ce = NULL; 1585 bool sentinel = false; 1586 u32 ccid = -1; 1587 1588 trace_ports(execlists, msg, execlists->pending); 1589 1590 /* We may be messing around with the lists during reset, lalala */ 1591 if (reset_in_progress(execlists)) 1592 return true; 1593 1594 if (!execlists->pending[0]) { 1595 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1596 engine->name); 1597 return false; 1598 } 1599 1600 if (execlists->pending[execlists_num_ports(execlists)]) { 1601 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1602 engine->name, execlists_num_ports(execlists)); 1603 return false; 1604 } 1605 1606 for (port = execlists->pending; (rq = *port); port++) { 1607 unsigned long flags; 1608 bool ok = true; 1609 1610 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1611 GEM_BUG_ON(!i915_request_is_active(rq)); 1612 1613 if (ce == rq->context) { 1614 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1615 engine->name, 1616 ce->timeline->fence_context, 1617 port - execlists->pending); 1618 return false; 1619 } 1620 ce = rq->context; 1621 1622 if (ccid == ce->lrc.ccid) { 1623 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1624 engine->name, 1625 ccid, ce->timeline->fence_context, 1626 port - execlists->pending); 1627 return false; 1628 } 1629 ccid = ce->lrc.ccid; 1630 1631 /* 1632 * Sentinels are supposed to be the last request so they flush 1633 * the current execution off the HW. Check that they are the only 1634 * request in the pending submission. 1635 */ 1636 if (sentinel) { 1637 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1638 engine->name, 1639 ce->timeline->fence_context, 1640 port - execlists->pending); 1641 return false; 1642 } 1643 sentinel = i915_request_has_sentinel(rq); 1644 1645 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1646 if (!spin_trylock_irqsave(&rq->lock, flags)) 1647 continue; 1648 1649 if (i915_request_completed(rq)) 1650 goto unlock; 1651 1652 if (i915_active_is_idle(&ce->active) && 1653 !intel_context_is_barrier(ce)) { 1654 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1655 engine->name, 1656 ce->timeline->fence_context, 1657 port - execlists->pending); 1658 ok = false; 1659 goto unlock; 1660 } 1661 1662 if (!i915_vma_is_pinned(ce->state)) { 1663 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1664 engine->name, 1665 ce->timeline->fence_context, 1666 port - execlists->pending); 1667 ok = false; 1668 goto unlock; 1669 } 1670 1671 if (!i915_vma_is_pinned(ce->ring->vma)) { 1672 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1673 engine->name, 1674 ce->timeline->fence_context, 1675 port - execlists->pending); 1676 ok = false; 1677 goto unlock; 1678 } 1679 1680 unlock: 1681 spin_unlock_irqrestore(&rq->lock, flags); 1682 if (!ok) 1683 return false; 1684 } 1685 1686 return ce; 1687 } 1688 1689 static void execlists_submit_ports(struct intel_engine_cs *engine) 1690 { 1691 struct intel_engine_execlists *execlists = &engine->execlists; 1692 unsigned int n; 1693 1694 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1695 1696 /* 1697 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1698 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1699 * not be relinquished until the device is idle (see 1700 * i915_gem_idle_work_handler()). As a precaution, we make sure 1701 * that all ELSP are drained i.e. we have processed the CSB, 1702 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1703 */ 1704 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1705 1706 /* 1707 * ELSQ note: the submit queue is not cleared after being submitted 1708 * to the HW so we need to make sure we always clean it up. This is 1709 * currently ensured by the fact that we always write the same number 1710 * of elsq entries, keep this in mind before changing the loop below. 1711 */ 1712 for (n = execlists_num_ports(execlists); n--; ) { 1713 struct i915_request *rq = execlists->pending[n]; 1714 1715 write_desc(execlists, 1716 rq ? execlists_update_context(rq) : 0, 1717 n); 1718 } 1719 1720 /* we need to manually load the submit queue */ 1721 if (execlists->ctrl_reg) 1722 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1723 } 1724 1725 static bool ctx_single_port_submission(const struct intel_context *ce) 1726 { 1727 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1728 intel_context_force_single_submission(ce)); 1729 } 1730 1731 static bool can_merge_ctx(const struct intel_context *prev, 1732 const struct intel_context *next) 1733 { 1734 if (prev != next) 1735 return false; 1736 1737 if (ctx_single_port_submission(prev)) 1738 return false; 1739 1740 return true; 1741 } 1742 1743 static unsigned long i915_request_flags(const struct i915_request *rq) 1744 { 1745 return READ_ONCE(rq->fence.flags); 1746 } 1747 1748 static bool can_merge_rq(const struct i915_request *prev, 1749 const struct i915_request *next) 1750 { 1751 GEM_BUG_ON(prev == next); 1752 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1753 1754 /* 1755 * We do not submit known completed requests. Therefore if the next 1756 * request is already completed, we can pretend to merge it in 1757 * with the previous context (and we will skip updating the ELSP 1758 * and tracking). 
Thus hopefully keeping the ELSP full with active 1759 * contexts, despite the best efforts of preempt-to-busy to confuse 1760 * us. 1761 */ 1762 if (i915_request_completed(next)) 1763 return true; 1764 1765 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) & 1766 (BIT(I915_FENCE_FLAG_NOPREEMPT) | 1767 BIT(I915_FENCE_FLAG_SENTINEL)))) 1768 return false; 1769 1770 if (!can_merge_ctx(prev->context, next->context)) 1771 return false; 1772 1773 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno)); 1774 return true; 1775 } 1776 1777 static void virtual_update_register_offsets(u32 *regs, 1778 struct intel_engine_cs *engine) 1779 { 1780 set_offsets(regs, reg_offsets(engine), engine, false); 1781 } 1782 1783 static bool virtual_matches(const struct virtual_engine *ve, 1784 const struct i915_request *rq, 1785 const struct intel_engine_cs *engine) 1786 { 1787 const struct intel_engine_cs *inflight; 1788 1789 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ 1790 return false; 1791 1792 /* 1793 * We track when the HW has completed saving the context image 1794 * (i.e. when we have seen the final CS event switching out of 1795 * the context) and must not overwrite the context image before 1796 * then. This restricts us to only using the active engine 1797 * while the previous virtualized request is inflight (so 1798 * we reuse the register offsets). This is a very small 1799 * hystersis on the greedy seelction algorithm. 1800 */ 1801 inflight = intel_context_inflight(&ve->context); 1802 if (inflight && inflight != engine) 1803 return false; 1804 1805 return true; 1806 } 1807 1808 static void virtual_xfer_context(struct virtual_engine *ve, 1809 struct intel_engine_cs *engine) 1810 { 1811 unsigned int n; 1812 1813 if (likely(engine == ve->siblings[0])) 1814 return; 1815 1816 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 1817 if (!intel_engine_has_relative_mmio(engine)) 1818 virtual_update_register_offsets(ve->context.lrc_reg_state, 1819 engine); 1820 1821 /* 1822 * Move the bound engine to the top of the list for 1823 * future execution. We then kick this tasklet first 1824 * before checking others, so that we preferentially 1825 * reuse this set of bound registers. 1826 */ 1827 for (n = 1; n < ve->num_siblings; n++) { 1828 if (ve->siblings[n] == engine) { 1829 swap(ve->siblings[n], ve->siblings[0]); 1830 break; 1831 } 1832 } 1833 } 1834 1835 #define for_each_waiter(p__, rq__) \ 1836 list_for_each_entry_lockless(p__, \ 1837 &(rq__)->sched.waiters_list, \ 1838 wait_link) 1839 1840 #define for_each_signaler(p__, rq__) \ 1841 list_for_each_entry_rcu(p__, \ 1842 &(rq__)->sched.signalers_list, \ 1843 signal_link) 1844 1845 static void defer_request(struct i915_request *rq, struct list_head * const pl) 1846 { 1847 LIST_HEAD(list); 1848 1849 /* 1850 * We want to move the interrupted request to the back of 1851 * the round-robin list (i.e. its priority level), but 1852 * in doing so, we must then move all requests that were in 1853 * flight and were waiting for the interrupted request to 1854 * be run after it again. 
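	 *
	 * For instance (an illustrative scenario): if request B was submitted
	 * after, and waits upon, the interrupted request A, then both A and B
	 * are moved to the tail of A's priority list, so B still runs after A
	 * instead of jumping ahead of it.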
1855 */ 1856 do { 1857 struct i915_dependency *p; 1858 1859 GEM_BUG_ON(i915_request_is_active(rq)); 1860 list_move_tail(&rq->sched.link, pl); 1861 1862 for_each_waiter(p, rq) { 1863 struct i915_request *w = 1864 container_of(p->waiter, typeof(*w), sched); 1865 1866 if (p->flags & I915_DEPENDENCY_WEAK) 1867 continue; 1868 1869 /* Leave semaphores spinning on the other engines */ 1870 if (w->engine != rq->engine) 1871 continue; 1872 1873 /* No waiter should start before its signaler */ 1874 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1875 i915_request_started(w) && 1876 !i915_request_completed(rq)); 1877 1878 GEM_BUG_ON(i915_request_is_active(w)); 1879 if (!i915_request_is_ready(w)) 1880 continue; 1881 1882 if (rq_prio(w) < rq_prio(rq)) 1883 continue; 1884 1885 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1886 list_move_tail(&w->sched.link, &list); 1887 } 1888 1889 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1890 } while (rq); 1891 } 1892 1893 static void defer_active(struct intel_engine_cs *engine) 1894 { 1895 struct i915_request *rq; 1896 1897 rq = __unwind_incomplete_requests(engine); 1898 if (!rq) 1899 return; 1900 1901 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1902 } 1903 1904 static bool 1905 need_timeslice(const struct intel_engine_cs *engine, 1906 const struct i915_request *rq, 1907 const struct rb_node *rb) 1908 { 1909 int hint; 1910 1911 if (!intel_engine_has_timeslices(engine)) 1912 return false; 1913 1914 hint = engine->execlists.queue_priority_hint; 1915 1916 if (rb) { 1917 const struct virtual_engine *ve = 1918 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1919 const struct intel_engine_cs *inflight = 1920 intel_context_inflight(&ve->context); 1921 1922 if (!inflight || inflight == engine) { 1923 struct i915_request *next; 1924 1925 rcu_read_lock(); 1926 next = READ_ONCE(ve->request); 1927 if (next) 1928 hint = max(hint, rq_prio(next)); 1929 rcu_read_unlock(); 1930 } 1931 } 1932 1933 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1934 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1935 1936 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1937 return hint >= effective_prio(rq); 1938 } 1939 1940 static bool 1941 timeslice_yield(const struct intel_engine_execlists *el, 1942 const struct i915_request *rq) 1943 { 1944 /* 1945 * Once bitten, forever smitten! 1946 * 1947 * If the active context ever busy-waited on a semaphore, 1948 * it will be treated as a hog until the end of its timeslice (i.e. 1949 * until it is scheduled out and replaced by a new submission, 1950 * possibly even its own lite-restore). The HW only sends an interrupt 1951 * on the first miss, and we do know if that semaphore has been 1952 * signaled, or even if it is now stuck on another semaphore. Play 1953 * safe, yield if it might be stuck -- it will be given a fresh 1954 * timeslice in the near future. 
1955 */ 1956 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1957 } 1958 1959 static bool 1960 timeslice_expired(const struct intel_engine_execlists *el, 1961 const struct i915_request *rq) 1962 { 1963 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1964 } 1965 1966 static int 1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1968 { 1969 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1970 return engine->execlists.queue_priority_hint; 1971 1972 return rq_prio(list_next_entry(rq, sched.link)); 1973 } 1974 1975 static inline unsigned long 1976 timeslice(const struct intel_engine_cs *engine) 1977 { 1978 return READ_ONCE(engine->props.timeslice_duration_ms); 1979 } 1980 1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1982 { 1983 const struct intel_engine_execlists *execlists = &engine->execlists; 1984 const struct i915_request *rq = *execlists->active; 1985 1986 if (!rq || i915_request_completed(rq)) 1987 return 0; 1988 1989 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1990 return 0; 1991 1992 return timeslice(engine); 1993 } 1994 1995 static void set_timeslice(struct intel_engine_cs *engine) 1996 { 1997 unsigned long duration; 1998 1999 if (!intel_engine_has_timeslices(engine)) 2000 return; 2001 2002 duration = active_timeslice(engine); 2003 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2004 2005 set_timer_ms(&engine->execlists.timer, duration); 2006 } 2007 2008 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2009 { 2010 struct intel_engine_execlists *execlists = &engine->execlists; 2011 unsigned long duration; 2012 2013 if (!intel_engine_has_timeslices(engine)) 2014 return; 2015 2016 WRITE_ONCE(execlists->switch_priority_hint, prio); 2017 if (prio == INT_MIN) 2018 return; 2019 2020 if (timer_pending(&execlists->timer)) 2021 return; 2022 2023 duration = timeslice(engine); 2024 ENGINE_TRACE(engine, 2025 "start timeslicing, prio:%d, interval:%lu", 2026 prio, duration); 2027 2028 set_timer_ms(&execlists->timer, duration); 2029 } 2030 2031 static void record_preemption(struct intel_engine_execlists *execlists) 2032 { 2033 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2034 } 2035 2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2037 const struct i915_request *rq) 2038 { 2039 if (!rq) 2040 return 0; 2041 2042 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2043 if (unlikely(intel_context_is_banned(rq->context))) 2044 return 1; 2045 2046 return READ_ONCE(engine->props.preempt_timeout_ms); 2047 } 2048 2049 static void set_preempt_timeout(struct intel_engine_cs *engine, 2050 const struct i915_request *rq) 2051 { 2052 if (!intel_engine_has_preempt_reset(engine)) 2053 return; 2054 2055 set_timer_ms(&engine->execlists.preempt, 2056 active_preempt_timeout(engine, rq)); 2057 } 2058 2059 static inline void clear_ports(struct i915_request **ports, int count) 2060 { 2061 memset_p((void **)ports, NULL, count); 2062 } 2063 2064 static inline void 2065 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2066 { 2067 /* A memcpy_p() would be very useful here! 
*/ 2068 while (count--) 2069 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2070 } 2071 2072 static void execlists_dequeue(struct intel_engine_cs *engine) 2073 { 2074 struct intel_engine_execlists * const execlists = &engine->execlists; 2075 struct i915_request **port = execlists->pending; 2076 struct i915_request ** const last_port = port + execlists->port_mask; 2077 struct i915_request * const *active; 2078 struct i915_request *last; 2079 struct rb_node *rb; 2080 bool submit = false; 2081 2082 /* 2083 * Hardware submission is through 2 ports. Conceptually each port 2084 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2085 * static for a context, and unique to each, so we only execute 2086 * requests belonging to a single context from each ring. RING_HEAD 2087 * is maintained by the CS in the context image, it marks the place 2088 * where it got up to last time, and through RING_TAIL we tell the CS 2089 * where we want to execute up to this time. 2090 * 2091 * In this list the requests are in order of execution. Consecutive 2092 * requests from the same context are adjacent in the ringbuffer. We 2093 * can combine these requests into a single RING_TAIL update: 2094 * 2095 * RING_HEAD...req1...req2 2096 * ^- RING_TAIL 2097 * since to execute req2 the CS must first execute req1. 2098 * 2099 * Our goal then is to point each port to the end of a consecutive 2100 * sequence of requests as being the most optimal (fewest wake ups 2101 * and context switches) submission. 2102 */ 2103 2104 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2105 struct virtual_engine *ve = 2106 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2107 struct i915_request *rq = READ_ONCE(ve->request); 2108 2109 if (!rq) { /* lazily cleanup after another engine handled rq */ 2110 rb_erase_cached(rb, &execlists->virtual); 2111 RB_CLEAR_NODE(rb); 2112 rb = rb_first_cached(&execlists->virtual); 2113 continue; 2114 } 2115 2116 if (!virtual_matches(ve, rq, engine)) { 2117 rb = rb_next(rb); 2118 continue; 2119 } 2120 2121 break; 2122 } 2123 2124 /* 2125 * If the queue is higher priority than the last 2126 * request in the currently active context, submit afresh. 2127 * We will resubmit again afterwards in case we need to split 2128 * the active context to interject the preemption request, 2129 * i.e. we will retrigger preemption following the ack in case 2130 * of trouble. 2131 */ 2132 active = READ_ONCE(execlists->active); 2133 2134 /* 2135 * In theory we can skip over completed contexts that have not 2136 * yet been processed by events (as those events are in flight): 2137 * 2138 * while ((last = *active) && i915_request_completed(last)) 2139 * active++; 2140 * 2141 * However, the GPU cannot handle this as it will ultimately 2142 * find itself trying to jump back into a context it has just 2143 * completed and barf. 2144 */ 2145 2146 if ((last = *active)) { 2147 if (need_preempt(engine, last, rb)) { 2148 if (i915_request_completed(last)) { 2149 tasklet_hi_schedule(&execlists->tasklet); 2150 return; 2151 } 2152 2153 ENGINE_TRACE(engine, 2154 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2155 last->fence.context, 2156 last->fence.seqno, 2157 last->sched.attr.priority, 2158 execlists->queue_priority_hint); 2159 record_preemption(execlists); 2160 2161 /* 2162 * Don't let the RING_HEAD advance past the breadcrumb 2163 * as we unwind (and until we resubmit) so that we do 2164 * not accidentally tell it to go backwards. 
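	 * (ring_set_paused() below is what enforces this while we unwind;
	 * the pause is only lifted again, via ring_set_paused(engine, 0),
	 * once the promotion has been acknowledged in process_csb() or if
	 * we end up skipping the submission.)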
2165 */ 2166 ring_set_paused(engine, 1); 2167 2168 /* 2169 * Note that we have not stopped the GPU at this point, 2170 * so we are unwinding the incomplete requests as they 2171 * remain inflight and so by the time we do complete 2172 * the preemption, some of the unwound requests may 2173 * complete! 2174 */ 2175 __unwind_incomplete_requests(engine); 2176 2177 last = NULL; 2178 } else if (need_timeslice(engine, last, rb) && 2179 timeslice_expired(execlists, last)) { 2180 if (i915_request_completed(last)) { 2181 tasklet_hi_schedule(&execlists->tasklet); 2182 return; 2183 } 2184 2185 ENGINE_TRACE(engine, 2186 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2187 last->fence.context, 2188 last->fence.seqno, 2189 last->sched.attr.priority, 2190 execlists->queue_priority_hint, 2191 yesno(timeslice_yield(execlists, last))); 2192 2193 ring_set_paused(engine, 1); 2194 defer_active(engine); 2195 2196 /* 2197 * Unlike for preemption, if we rewind and continue 2198 * executing the same context as previously active, 2199 * the order of execution will remain the same and 2200 * the tail will only advance. We do not need to 2201 * force a full context restore, as a lite-restore 2202 * is sufficient to resample the monotonic TAIL. 2203 * 2204 * If we switch to any other context, similarly we 2205 * will not rewind TAIL of current context, and 2206 * normal save/restore will preserve state and allow 2207 * us to later continue executing the same request. 2208 */ 2209 last = NULL; 2210 } else { 2211 /* 2212 * Otherwise if we already have a request pending 2213 * for execution after the current one, we can 2214 * just wait until the next CS event before 2215 * queuing more. In either case we will force a 2216 * lite-restore preemption event, but if we wait 2217 * we hopefully coalesce several updates into a single 2218 * submission. 2219 */ 2220 if (!list_is_last(&last->sched.link, 2221 &engine->active.requests)) { 2222 /* 2223 * Even if ELSP[1] is occupied and not worthy 2224 * of timeslices, our queue might be. 2225 */ 2226 start_timeslice(engine, queue_prio(execlists)); 2227 return; 2228 } 2229 } 2230 } 2231 2232 while (rb) { /* XXX virtual is always taking precedence */ 2233 struct virtual_engine *ve = 2234 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2235 struct i915_request *rq; 2236 2237 spin_lock(&ve->base.active.lock); 2238 2239 rq = ve->request; 2240 if (unlikely(!rq)) { /* lost the race to a sibling */ 2241 spin_unlock(&ve->base.active.lock); 2242 rb_erase_cached(rb, &execlists->virtual); 2243 RB_CLEAR_NODE(rb); 2244 rb = rb_first_cached(&execlists->virtual); 2245 continue; 2246 } 2247 2248 GEM_BUG_ON(rq != ve->request); 2249 GEM_BUG_ON(rq->engine != &ve->base); 2250 GEM_BUG_ON(rq->context != &ve->context); 2251 2252 if (rq_prio(rq) >= queue_prio(execlists)) { 2253 if (!virtual_matches(ve, rq, engine)) { 2254 spin_unlock(&ve->base.active.lock); 2255 rb = rb_next(rb); 2256 continue; 2257 } 2258 2259 if (last && !can_merge_rq(last, rq)) { 2260 spin_unlock(&ve->base.active.lock); 2261 start_timeslice(engine, rq_prio(rq)); 2262 return; /* leave this for another sibling */ 2263 } 2264 2265 ENGINE_TRACE(engine, 2266 "virtual rq=%llx:%lld%s, new engine? %s\n", 2267 rq->fence.context, 2268 rq->fence.seqno, 2269 i915_request_completed(rq) ? "!" : 2270 i915_request_started(rq) ? 
"*" : 2271 "", 2272 yesno(engine != ve->siblings[0])); 2273 2274 WRITE_ONCE(ve->request, NULL); 2275 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2276 INT_MIN); 2277 rb_erase_cached(rb, &execlists->virtual); 2278 RB_CLEAR_NODE(rb); 2279 2280 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2281 WRITE_ONCE(rq->engine, engine); 2282 2283 if (__i915_request_submit(rq)) { 2284 /* 2285 * Only after we confirm that we will submit 2286 * this request (i.e. it has not already 2287 * completed), do we want to update the context. 2288 * 2289 * This serves two purposes. It avoids 2290 * unnecessary work if we are resubmitting an 2291 * already completed request after timeslicing. 2292 * But more importantly, it prevents us altering 2293 * ve->siblings[] on an idle context, where 2294 * we may be using ve->siblings[] in 2295 * virtual_context_enter / virtual_context_exit. 2296 */ 2297 virtual_xfer_context(ve, engine); 2298 GEM_BUG_ON(ve->siblings[0] != engine); 2299 2300 submit = true; 2301 last = rq; 2302 } 2303 i915_request_put(rq); 2304 2305 /* 2306 * Hmm, we have a bunch of virtual engine requests, 2307 * but the first one was already completed (thanks 2308 * preempt-to-busy!). Keep looking at the veng queue 2309 * until we have no more relevant requests (i.e. 2310 * the normal submit queue has higher priority). 2311 */ 2312 if (!submit) { 2313 spin_unlock(&ve->base.active.lock); 2314 rb = rb_first_cached(&execlists->virtual); 2315 continue; 2316 } 2317 } 2318 2319 spin_unlock(&ve->base.active.lock); 2320 break; 2321 } 2322 2323 while ((rb = rb_first_cached(&execlists->queue))) { 2324 struct i915_priolist *p = to_priolist(rb); 2325 struct i915_request *rq, *rn; 2326 int i; 2327 2328 priolist_for_each_request_consume(rq, rn, p, i) { 2329 bool merge = true; 2330 2331 /* 2332 * Can we combine this request with the current port? 2333 * It has to be the same context/ringbuffer and not 2334 * have any exceptions (e.g. GVT saying never to 2335 * combine contexts). 2336 * 2337 * If we can combine the requests, we can execute both 2338 * by updating the RING_TAIL to point to the end of the 2339 * second request, and so we never need to tell the 2340 * hardware about the first. 2341 */ 2342 if (last && !can_merge_rq(last, rq)) { 2343 /* 2344 * If we are on the second port and cannot 2345 * combine this request with the last, then we 2346 * are done. 2347 */ 2348 if (port == last_port) 2349 goto done; 2350 2351 /* 2352 * We must not populate both ELSP[] with the 2353 * same LRCA, i.e. we must submit 2 different 2354 * contexts if we submit 2 ELSP. 2355 */ 2356 if (last->context == rq->context) 2357 goto done; 2358 2359 if (i915_request_has_sentinel(last)) 2360 goto done; 2361 2362 /* 2363 * If GVT overrides us we only ever submit 2364 * port[0], leaving port[1] empty. Note that we 2365 * also have to be careful that we don't queue 2366 * the same context (even though a different 2367 * request) to the second port. 
2368 */ 2369 if (ctx_single_port_submission(last->context) || 2370 ctx_single_port_submission(rq->context)) 2371 goto done; 2372 2373 merge = false; 2374 } 2375 2376 if (__i915_request_submit(rq)) { 2377 if (!merge) { 2378 *port = execlists_schedule_in(last, port - execlists->pending); 2379 port++; 2380 last = NULL; 2381 } 2382 2383 GEM_BUG_ON(last && 2384 !can_merge_ctx(last->context, 2385 rq->context)); 2386 GEM_BUG_ON(last && 2387 i915_seqno_passed(last->fence.seqno, 2388 rq->fence.seqno)); 2389 2390 submit = true; 2391 last = rq; 2392 } 2393 } 2394 2395 rb_erase_cached(&p->node, &execlists->queue); 2396 i915_priolist_free(p); 2397 } 2398 2399 done: 2400 /* 2401 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2402 * 2403 * We choose the priority hint such that if we add a request of greater 2404 * priority than this, we kick the submission tasklet to decide on 2405 * the right order of submitting the requests to hardware. We must 2406 * also be prepared to reorder requests as they are in-flight on the 2407 * HW. We derive the priority hint then as the first "hole" in 2408 * the HW submission ports and if there are no available slots, 2409 * the priority of the lowest executing request, i.e. last. 2410 * 2411 * When we do receive a higher priority request ready to run from the 2412 * user, see queue_request(), the priority hint is bumped to that 2413 * request triggering preemption on the next dequeue (or subsequent 2414 * interrupt for secondary ports). 2415 */ 2416 execlists->queue_priority_hint = queue_prio(execlists); 2417 2418 if (submit) { 2419 *port = execlists_schedule_in(last, port - execlists->pending); 2420 execlists->switch_priority_hint = 2421 switch_prio(engine, *execlists->pending); 2422 2423 /* 2424 * Skip if we ended up with exactly the same set of requests, 2425 * e.g. 
trying to timeslice a pair of ordered contexts 2426 */ 2427 if (!memcmp(active, execlists->pending, 2428 (port - execlists->pending + 1) * sizeof(*port))) { 2429 do 2430 execlists_schedule_out(fetch_and_zero(port)); 2431 while (port-- != execlists->pending); 2432 2433 goto skip_submit; 2434 } 2435 clear_ports(port + 1, last_port - port); 2436 2437 WRITE_ONCE(execlists->yield, -1); 2438 set_preempt_timeout(engine, *active); 2439 execlists_submit_ports(engine); 2440 } else { 2441 start_timeslice(engine, execlists->queue_priority_hint); 2442 skip_submit: 2443 ring_set_paused(engine, 0); 2444 } 2445 } 2446 2447 static void 2448 cancel_port_requests(struct intel_engine_execlists * const execlists) 2449 { 2450 struct i915_request * const *port; 2451 2452 for (port = execlists->pending; *port; port++) 2453 execlists_schedule_out(*port); 2454 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2455 2456 /* Mark the end of active before we overwrite *active */ 2457 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2458 execlists_schedule_out(*port); 2459 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2460 2461 smp_wmb(); /* complete the seqlock for execlists_active() */ 2462 WRITE_ONCE(execlists->active, execlists->inflight); 2463 } 2464 2465 static inline void 2466 invalidate_csb_entries(const u64 *first, const u64 *last) 2467 { 2468 clflush((void *)first); 2469 clflush((void *)last); 2470 } 2471 2472 /* 2473 * Starting with Gen12, the status has a new format: 2474 * 2475 * bit 0: switched to new queue 2476 * bit 1: reserved 2477 * bit 2: semaphore wait mode (poll or signal), only valid when 2478 * switch detail is set to "wait on semaphore" 2479 * bits 3-5: engine class 2480 * bits 6-11: engine instance 2481 * bits 12-14: reserved 2482 * bits 15-25: sw context id of the lrc the GT switched to 2483 * bits 26-31: sw counter of the lrc the GT switched to 2484 * bits 32-35: context switch detail 2485 * - 0: ctx complete 2486 * - 1: wait on sync flip 2487 * - 2: wait on vblank 2488 * - 3: wait on scanline 2489 * - 4: wait on semaphore 2490 * - 5: context preempted (not on SEMAPHORE_WAIT or 2491 * WAIT_FOR_EVENT) 2492 * bit 36: reserved 2493 * bits 37-43: wait detail (for switch detail 1 to 4) 2494 * bits 44-46: reserved 2495 * bits 47-57: sw context id of the lrc the GT switched away from 2496 * bits 58-63: sw counter of the lrc the GT switched away from 2497 */ 2498 static inline bool gen12_csb_parse(const u64 *csb) 2499 { 2500 bool ctx_away_valid; 2501 bool new_queue; 2502 u64 entry; 2503 2504 /* HSD#22011248461 */ 2505 entry = READ_ONCE(*csb); 2506 if (unlikely(entry == -1)) { 2507 preempt_disable(); 2508 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) 2509 GEM_WARN_ON("50us CSB timeout"); 2510 preempt_enable(); 2511 } 2512 WRITE_ONCE(*(u64 *)csb, -1); 2513 2514 ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); 2515 new_queue = 2516 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2517 2518 /* 2519 * The context switch detail is not guaranteed to be 5 when a preemption 2520 * occurs, so we can't just check for that. The check below works for 2521 * all the cases we care about, including preemptions of WAIT 2522 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2523 * would require some extra handling, but we don't support that. 
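	 *
	 * As a rough sketch of the decode (field positions taken from the
	 * layout documented above; the actual code uses the GEN12_* macros):
	 *
	 *	new_queue     = entry & BIT(0);
	 *	to_sw_id      = (entry >> 15) & 0x7ff;	// bits 15-25
	 *	away_sw_id    = (entry >> 47) & 0x7ff;	// bits 47-57
	 *	switch_detail = (entry >> 32) & 0xf;	// bits 32-35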
2524 */ 2525 if (!ctx_away_valid || new_queue) { 2526 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry))); 2527 return true; 2528 } 2529 2530 /* 2531 * switch detail = 5 is covered by the case above and we do not expect a 2532 * context switch on an unsuccessful wait instruction since we always 2533 * use polling mode. 2534 */ 2535 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry))); 2536 return false; 2537 } 2538 2539 static inline bool gen8_csb_parse(const u64 *csb) 2540 { 2541 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2542 } 2543 2544 static void process_csb(struct intel_engine_cs *engine) 2545 { 2546 struct intel_engine_execlists * const execlists = &engine->execlists; 2547 const u64 * const buf = execlists->csb_status; 2548 const u8 num_entries = execlists->csb_size; 2549 u8 head, tail; 2550 2551 /* 2552 * As we modify our execlists state tracking we require exclusive 2553 * access. Either we are inside the tasklet, or the tasklet is disabled 2554 * and we assume that is only inside the reset paths and so serialised. 2555 */ 2556 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2557 !reset_in_progress(execlists)); 2558 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2559 2560 /* 2561 * Note that csb_write, csb_status may be either in HWSP or mmio. 2562 * When reading from the csb_write mmio register, we have to be 2563 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2564 * the low 4bits. As it happens we know the next 4bits are always 2565 * zero and so we can simply masked off the low u8 of the register 2566 * and treat it identically to reading from the HWSP (without having 2567 * to use explicit shifting and masking, and probably bifurcating 2568 * the code to handle the legacy mmio read). 2569 */ 2570 head = execlists->csb_head; 2571 tail = READ_ONCE(*execlists->csb_write); 2572 if (unlikely(head == tail)) 2573 return; 2574 2575 /* 2576 * We will consume all events from HW, or at least pretend to. 2577 * 2578 * The sequence of events from the HW is deterministic, and derived 2579 * from our writes to the ELSP, with a smidgen of variability for 2580 * the arrival of the asynchronous requests wrt to the inflight 2581 * execution. If the HW sends an event that does not correspond with 2582 * the one we are expecting, we have to abandon all hope as we lose 2583 * all tracking of what the engine is actually executing. We will 2584 * only detect we are out of sequence with the HW when we get an 2585 * 'impossible' event because we have already drained our own 2586 * preemption/promotion queue. If this occurs, we know that we likely 2587 * lost track of execution earlier and must unwind and restart, the 2588 * simplest way is by stop processing the event queue and force the 2589 * engine to reset. 2590 */ 2591 execlists->csb_head = tail; 2592 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2593 2594 /* 2595 * Hopefully paired with a wmb() in HW! 2596 * 2597 * We must complete the read of the write pointer before any reads 2598 * from the CSB, so that we do not see stale values. Without an rmb 2599 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2600 * we perform the READ_ONCE(*csb_write). 2601 */ 2602 rmb(); 2603 do { 2604 bool promote; 2605 2606 if (++head == num_entries) 2607 head = 0; 2608 2609 /* 2610 * We are flying near dragons again. 2611 * 2612 * We hold a reference to the request in execlist_port[] 2613 * but no more than that. 
We are operating in softirq 2614 * context and so cannot hold any mutex or sleep. That 2615 * prevents us stopping the requests we are processing 2616 * in port[] from being retired simultaneously (the 2617 * breadcrumb will be complete before we see the 2618 * context-switch). As we only hold the reference to the 2619 * request, any pointer chasing underneath the request 2620 * is subject to a potential use-after-free. Thus we 2621 * store all of the bookkeeping within port[] as 2622 * required, and avoid using unguarded pointers beneath 2623 * request itself. The same applies to the atomic 2624 * status notifier. 2625 */ 2626 2627 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2628 head, 2629 upper_32_bits(buf[head]), 2630 lower_32_bits(buf[head])); 2631 2632 if (INTEL_GEN(engine->i915) >= 12) 2633 promote = gen12_csb_parse(buf + head); 2634 else 2635 promote = gen8_csb_parse(buf + head); 2636 if (promote) { 2637 struct i915_request * const *old = execlists->active; 2638 2639 if (GEM_WARN_ON(!*execlists->pending)) { 2640 execlists->error_interrupt |= ERROR_CSB; 2641 break; 2642 } 2643 2644 ring_set_paused(engine, 0); 2645 2646 /* Point active to the new ELSP; prevent overwriting */ 2647 WRITE_ONCE(execlists->active, execlists->pending); 2648 smp_wmb(); /* notify execlists_active() */ 2649 2650 /* cancel old inflight, prepare for switch */ 2651 trace_ports(execlists, "preempted", old); 2652 while (*old) 2653 execlists_schedule_out(*old++); 2654 2655 /* switch pending to inflight */ 2656 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2657 copy_ports(execlists->inflight, 2658 execlists->pending, 2659 execlists_num_ports(execlists)); 2660 smp_wmb(); /* complete the seqlock */ 2661 WRITE_ONCE(execlists->active, execlists->inflight); 2662 2663 /* XXX Magic delay for tgl */ 2664 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2665 2666 WRITE_ONCE(execlists->pending[0], NULL); 2667 } else { 2668 if (GEM_WARN_ON(!*execlists->active)) { 2669 execlists->error_interrupt |= ERROR_CSB; 2670 break; 2671 } 2672 2673 /* port0 completed, advanced to port1 */ 2674 trace_ports(execlists, "completed", execlists->active); 2675 2676 /* 2677 * We rely on the hardware being strongly 2678 * ordered, that the breadcrumb write is 2679 * coherent (visible from the CPU) before the 2680 * user interrupt is processed. One might assume 2681 * that the breadcrumb write being before the 2682 * user interrupt and the CS event for the context 2683 * switch would therefore be before the CS event 2684 * itself... 
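			 *
			 * That inference is not relied upon, however: should
			 * the context-complete event arrive while the
			 * breadcrumb is not yet visible to the CPU, the debug
			 * check below will report it.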
			 */
			if (GEM_SHOW_DEBUG() &&
			    !i915_request_completed(*execlists->active)) {
				struct i915_request *rq = *execlists->active;
				const u32 *regs __maybe_unused =
					rq->context->lrc_reg_state;

				ENGINE_TRACE(engine,
					     "context completed before request!\n");
				ENGINE_TRACE(engine,
					     "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
					     ENGINE_READ(engine, RING_START),
					     ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
					     ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
					     ENGINE_READ(engine, RING_CTL),
					     ENGINE_READ(engine, RING_MI_MODE));
				ENGINE_TRACE(engine,
					     "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
					     i915_ggtt_offset(rq->ring->vma),
					     rq->head, rq->tail,
					     rq->fence.context,
					     lower_32_bits(rq->fence.seqno),
					     hwsp_seqno(rq));
				ENGINE_TRACE(engine,
					     "ctx:{start:%08x, head:%04x, tail:%04x}, ",
					     regs[CTX_RING_START],
					     regs[CTX_RING_HEAD],
					     regs[CTX_RING_TAIL]);
			}

			execlists_schedule_out(*execlists->active++);

			GEM_BUG_ON(execlists->active - execlists->inflight >
				   execlists_num_ports(execlists));
		}
	} while (head != tail);

	set_timeslice(engine);

	/*
	 * Gen11 has proven to fail wrt global observation point between
	 * entry and tail update, failing on the ordering and thus
	 * we see an old entry in the context status buffer.
	 *
	 * Forcibly evict the entries for the next gpu csb update,
	 * to increase the odds that we get fresh entries even with
	 * non-working hardware. The cost of doing so comes out mostly in
	 * the wash, as the hardware, working or not, will need to do the
	 * invalidation anyway.
	 */
	invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
}

static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
{
	lockdep_assert_held(&engine->active.lock);
	if (!READ_ONCE(engine->execlists.pending[0])) {
		rcu_read_lock(); /* protect peeking at execlists->active */
		execlists_dequeue(engine);
		rcu_read_unlock();
	}
}

static void __execlists_hold(struct i915_request *rq)
{
	LIST_HEAD(list);

	do {
		struct i915_dependency *p;

		if (i915_request_is_active(rq))
			__i915_request_unsubmit(rq);

		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
		i915_request_set_hold(rq);
		RQ_TRACE(rq, "on hold\n");

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			if (!i915_request_is_ready(w))
				continue;

			if (i915_request_completed(w))
				continue;

			if (i915_request_on_hold(w))
				continue;

			list_move_tail(&w->sched.link, &list);
		}

		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
	} while (rq);
}

static bool execlists_hold(struct intel_engine_cs *engine,
			   struct i915_request *rq)
{
	spin_lock_irq(&engine->active.lock);

	if (i915_request_completed(rq)) { /* too late! */
		rq = NULL;
		goto unlock;
	}

	if (rq->engine != engine) { /* preempted virtual engine */
		struct virtual_engine *ve = to_virtual_engine(rq->engine);

		/*
		 * intel_context_inflight() is only protected by virtue
		 * of process_csb() being called only by the tasklet (or
		 * directly from inside reset while the tasklet is suspended).
		 * Assert that neither of those are allowed to run while we
		 * poke at the request queues.
		 */
		GEM_BUG_ON(!reset_in_progress(&engine->execlists));

		/*
		 * An unsubmitted request along a virtual engine will
		 * remain on the active (this) engine until we are able
		 * to process the context switch away (and so mark the
		 * context as no longer in flight). That cannot have happened
		 * yet, otherwise we would not be hanging!
		 */
		spin_lock(&ve->base.active.lock);
		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
		GEM_BUG_ON(ve->request != rq);
		ve->request = NULL;
		spin_unlock(&ve->base.active.lock);
		i915_request_put(rq);

		rq->engine = engine;
	}

	/*
	 * Transfer this request onto the hold queue to prevent it
	 * being resubmitted to HW (and potentially completed) before we have
	 * released it. Since we may have already submitted following
	 * requests, we need to remove those as well.
	 */
	GEM_BUG_ON(i915_request_on_hold(rq));
	GEM_BUG_ON(rq->engine != engine);
	__execlists_hold(rq);
	GEM_BUG_ON(list_empty(&engine->active.hold));

unlock:
	spin_unlock_irq(&engine->active.lock);
	return rq;
}

static bool hold_request(const struct i915_request *rq)
{
	struct i915_dependency *p;
	bool result = false;

	/*
	 * If one of our ancestors is on hold, we must also be on hold,
	 * otherwise we will bypass it and execute before it.
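	 *
	 * e.g. if request B depends on request A on the same engine and A
	 * has been put on hold for error capture, then B must be held too;
	 * otherwise B would be submitted (and possibly completed) ahead of
	 * the very request we are still inspecting.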
2850 */ 2851 rcu_read_lock(); 2852 for_each_signaler(p, rq) { 2853 const struct i915_request *s = 2854 container_of(p->signaler, typeof(*s), sched); 2855 2856 if (s->engine != rq->engine) 2857 continue; 2858 2859 result = i915_request_on_hold(s); 2860 if (result) 2861 break; 2862 } 2863 rcu_read_unlock(); 2864 2865 return result; 2866 } 2867 2868 static void __execlists_unhold(struct i915_request *rq) 2869 { 2870 LIST_HEAD(list); 2871 2872 do { 2873 struct i915_dependency *p; 2874 2875 RQ_TRACE(rq, "hold release\n"); 2876 2877 GEM_BUG_ON(!i915_request_on_hold(rq)); 2878 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2879 2880 i915_request_clear_hold(rq); 2881 list_move_tail(&rq->sched.link, 2882 i915_sched_lookup_priolist(rq->engine, 2883 rq_prio(rq))); 2884 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2885 2886 /* Also release any children on this engine that are ready */ 2887 for_each_waiter(p, rq) { 2888 struct i915_request *w = 2889 container_of(p->waiter, typeof(*w), sched); 2890 2891 /* Propagate any change in error status */ 2892 if (rq->fence.error) 2893 i915_request_set_error_once(w, rq->fence.error); 2894 2895 if (w->engine != rq->engine) 2896 continue; 2897 2898 if (!i915_request_on_hold(w)) 2899 continue; 2900 2901 /* Check that no other parents are also on hold */ 2902 if (hold_request(w)) 2903 continue; 2904 2905 list_move_tail(&w->sched.link, &list); 2906 } 2907 2908 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2909 } while (rq); 2910 } 2911 2912 static void execlists_unhold(struct intel_engine_cs *engine, 2913 struct i915_request *rq) 2914 { 2915 spin_lock_irq(&engine->active.lock); 2916 2917 /* 2918 * Move this request back to the priority queue, and all of its 2919 * children and grandchildren that were suspended along with it. 2920 */ 2921 __execlists_unhold(rq); 2922 2923 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2924 engine->execlists.queue_priority_hint = rq_prio(rq); 2925 tasklet_hi_schedule(&engine->execlists.tasklet); 2926 } 2927 2928 spin_unlock_irq(&engine->active.lock); 2929 } 2930 2931 struct execlists_capture { 2932 struct work_struct work; 2933 struct i915_request *rq; 2934 struct i915_gpu_coredump *error; 2935 }; 2936 2937 static void execlists_capture_work(struct work_struct *work) 2938 { 2939 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2940 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2941 struct intel_engine_cs *engine = cap->rq->engine; 2942 struct intel_gt_coredump *gt = cap->error->gt; 2943 struct intel_engine_capture_vma *vma; 2944 2945 /* Compress all the objects attached to the request, slow! 
*/ 2946 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2947 if (vma) { 2948 struct i915_vma_compress *compress = 2949 i915_vma_capture_prepare(gt); 2950 2951 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2952 i915_vma_capture_finish(gt, compress); 2953 } 2954 2955 gt->simulated = gt->engine->simulated; 2956 cap->error->simulated = gt->simulated; 2957 2958 /* Publish the error state, and announce it to the world */ 2959 i915_error_state_store(cap->error); 2960 i915_gpu_coredump_put(cap->error); 2961 2962 /* Return this request and all that depend upon it for signaling */ 2963 execlists_unhold(engine, cap->rq); 2964 i915_request_put(cap->rq); 2965 2966 kfree(cap); 2967 } 2968 2969 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2970 { 2971 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2972 struct execlists_capture *cap; 2973 2974 cap = kmalloc(sizeof(*cap), gfp); 2975 if (!cap) 2976 return NULL; 2977 2978 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2979 if (!cap->error) 2980 goto err_cap; 2981 2982 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2983 if (!cap->error->gt) 2984 goto err_gpu; 2985 2986 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2987 if (!cap->error->gt->engine) 2988 goto err_gt; 2989 2990 return cap; 2991 2992 err_gt: 2993 kfree(cap->error->gt); 2994 err_gpu: 2995 kfree(cap->error); 2996 err_cap: 2997 kfree(cap); 2998 return NULL; 2999 } 3000 3001 static struct i915_request * 3002 active_context(struct intel_engine_cs *engine, u32 ccid) 3003 { 3004 const struct intel_engine_execlists * const el = &engine->execlists; 3005 struct i915_request * const *port, *rq; 3006 3007 /* 3008 * Use the most recent result from process_csb(), but just in case 3009 * we trigger an error (via interrupt) before the first CS event has 3010 * been written, peek at the next submission. 3011 */ 3012 3013 for (port = el->active; (rq = *port); port++) { 3014 if (rq->context->lrc.ccid == ccid) { 3015 ENGINE_TRACE(engine, 3016 "ccid found at active:%zd\n", 3017 port - el->active); 3018 return rq; 3019 } 3020 } 3021 3022 for (port = el->pending; (rq = *port); port++) { 3023 if (rq->context->lrc.ccid == ccid) { 3024 ENGINE_TRACE(engine, 3025 "ccid found at pending:%zd\n", 3026 port - el->pending); 3027 return rq; 3028 } 3029 } 3030 3031 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3032 return NULL; 3033 } 3034 3035 static u32 active_ccid(struct intel_engine_cs *engine) 3036 { 3037 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3038 } 3039 3040 static void execlists_capture(struct intel_engine_cs *engine) 3041 { 3042 struct execlists_capture *cap; 3043 3044 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3045 return; 3046 3047 /* 3048 * We need to _quickly_ capture the engine state before we reset. 3049 * We are inside an atomic section (softirq) here and we are delaying 3050 * the forced preemption event. 3051 */ 3052 cap = capture_regs(engine); 3053 if (!cap) 3054 return; 3055 3056 spin_lock_irq(&engine->active.lock); 3057 cap->rq = active_context(engine, active_ccid(engine)); 3058 if (cap->rq) { 3059 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3060 cap->rq = i915_request_get_rcu(cap->rq); 3061 } 3062 spin_unlock_irq(&engine->active.lock); 3063 if (!cap->rq) 3064 goto err_free; 3065 3066 /* 3067 * Remove the request from the execlists queue, and take ownership 3068 * of the request. 
We pass it to our worker who will _slowly_ compress
	 * all the pages the _user_ requested for debugging their batch, after
	 * which we return it to the queue for signaling.
	 *
	 * By removing them from the execlists queue, we also remove the
	 * requests from being processed by __unwind_incomplete_requests()
	 * during the intel_engine_reset(), and so they will *not* be replayed
	 * afterwards.
	 *
	 * Note that because we have not yet reset the engine at this point,
	 * it is possible that the request we have identified as being
	 * guilty did in fact complete and we will then hit an arbitration
	 * point allowing the outstanding preemption to succeed. The likelihood
	 * of that is very low (as capturing of the engine registers should be
	 * fast enough to run inside an irq-off atomic section!), so we will
	 * simply hold that request accountable for being non-preemptible
	 * long enough to force the reset.
	 */
	if (!execlists_hold(engine, cap->rq))
		goto err_rq;

	INIT_WORK(&cap->work, execlists_capture_work);
	schedule_work(&cap->work);
	return;

err_rq:
	i915_request_put(cap->rq);
err_free:
	i915_gpu_coredump_put(cap->error);
	kfree(cap);
}

static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
{
	const unsigned int bit = I915_RESET_ENGINE + engine->id;
	unsigned long *lock = &engine->gt->reset.flags;

	if (!intel_has_reset_engine(engine->gt))
		return;

	if (test_and_set_bit(bit, lock))
		return;

	ENGINE_TRACE(engine, "reset for %s\n", msg);

	/* Mark this tasklet as disabled to avoid waiting for it to complete */
	tasklet_disable_nosync(&engine->execlists.tasklet);

	ring_set_paused(engine, 1); /* Freeze the current request in place */
	execlists_capture(engine);
	intel_engine_reset(engine, msg);

	tasklet_enable(&engine->execlists.tasklet);
	clear_and_wake_up_bit(bit, lock);
}

static bool preempt_timeout(const struct intel_engine_cs *const engine)
{
	const struct timer_list *t = &engine->execlists.preempt;

	if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
		return false;

	if (!timer_expired(t))
		return false;

	return READ_ONCE(engine->execlists.pending[0]);
}

/*
 * Check the unread Context Status Buffers and manage the submission of new
 * contexts to the ELSP accordingly.
 */
static void execlists_submission_tasklet(unsigned long data)
{
	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
	bool timeout = preempt_timeout(engine);

	process_csb(engine);

	if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
		const char *msg;

		/* Generate the error message in priority wrt the user!
*/ 3152 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3153 msg = "CS error"; /* thrown by a user payload */ 3154 else if (engine->execlists.error_interrupt & ERROR_CSB) 3155 msg = "invalid CSB event"; 3156 else 3157 msg = "internal error"; 3158 3159 engine->execlists.error_interrupt = 0; 3160 execlists_reset(engine, msg); 3161 } 3162 3163 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3164 unsigned long flags; 3165 3166 spin_lock_irqsave(&engine->active.lock, flags); 3167 __execlists_submission_tasklet(engine); 3168 spin_unlock_irqrestore(&engine->active.lock, flags); 3169 3170 /* Recheck after serialising with direct-submission */ 3171 if (unlikely(timeout && preempt_timeout(engine))) 3172 execlists_reset(engine, "preemption time out"); 3173 } 3174 } 3175 3176 static void __execlists_kick(struct intel_engine_execlists *execlists) 3177 { 3178 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3179 tasklet_hi_schedule(&execlists->tasklet); 3180 } 3181 3182 #define execlists_kick(t, member) \ 3183 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3184 3185 static void execlists_timeslice(struct timer_list *timer) 3186 { 3187 execlists_kick(timer, timer); 3188 } 3189 3190 static void execlists_preempt(struct timer_list *timer) 3191 { 3192 execlists_kick(timer, preempt); 3193 } 3194 3195 static void queue_request(struct intel_engine_cs *engine, 3196 struct i915_request *rq) 3197 { 3198 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3199 list_add_tail(&rq->sched.link, 3200 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3201 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3202 } 3203 3204 static void __submit_queue_imm(struct intel_engine_cs *engine) 3205 { 3206 struct intel_engine_execlists * const execlists = &engine->execlists; 3207 3208 if (reset_in_progress(execlists)) 3209 return; /* defer until we restart the engine following reset */ 3210 3211 __execlists_submission_tasklet(engine); 3212 } 3213 3214 static void submit_queue(struct intel_engine_cs *engine, 3215 const struct i915_request *rq) 3216 { 3217 struct intel_engine_execlists *execlists = &engine->execlists; 3218 3219 if (rq_prio(rq) <= execlists->queue_priority_hint) 3220 return; 3221 3222 execlists->queue_priority_hint = rq_prio(rq); 3223 __submit_queue_imm(engine); 3224 } 3225 3226 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3227 const struct i915_request *rq) 3228 { 3229 GEM_BUG_ON(i915_request_on_hold(rq)); 3230 return !list_empty(&engine->active.hold) && hold_request(rq); 3231 } 3232 3233 static void flush_csb(struct intel_engine_cs *engine) 3234 { 3235 struct intel_engine_execlists *el = &engine->execlists; 3236 3237 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3238 if (!reset_in_progress(el)) 3239 process_csb(engine); 3240 tasklet_unlock(&el->tasklet); 3241 } 3242 } 3243 3244 static void execlists_submit_request(struct i915_request *request) 3245 { 3246 struct intel_engine_cs *engine = request->engine; 3247 unsigned long flags; 3248 3249 /* Hopefully we clear execlists->pending[] to let us through */ 3250 flush_csb(engine); 3251 3252 /* Will be called from irq-context when using foreign fences. 
*/ 3253 spin_lock_irqsave(&engine->active.lock, flags); 3254 3255 if (unlikely(ancestor_on_hold(engine, request))) { 3256 RQ_TRACE(request, "ancestor on hold\n"); 3257 list_add_tail(&request->sched.link, &engine->active.hold); 3258 i915_request_set_hold(request); 3259 } else { 3260 queue_request(engine, request); 3261 3262 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3263 GEM_BUG_ON(list_empty(&request->sched.link)); 3264 3265 submit_queue(engine, request); 3266 } 3267 3268 spin_unlock_irqrestore(&engine->active.lock, flags); 3269 } 3270 3271 static void __execlists_context_fini(struct intel_context *ce) 3272 { 3273 intel_ring_put(ce->ring); 3274 i915_vma_put(ce->state); 3275 } 3276 3277 static void execlists_context_destroy(struct kref *kref) 3278 { 3279 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3280 3281 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3282 GEM_BUG_ON(intel_context_is_pinned(ce)); 3283 3284 if (ce->state) 3285 __execlists_context_fini(ce); 3286 3287 intel_context_fini(ce); 3288 intel_context_free(ce); 3289 } 3290 3291 static void 3292 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3293 { 3294 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3295 return; 3296 3297 vaddr += engine->context_size; 3298 3299 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3300 } 3301 3302 static void 3303 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3304 { 3305 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3306 return; 3307 3308 vaddr += engine->context_size; 3309 3310 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3311 drm_err_once(&engine->i915->drm, 3312 "%s context redzone overwritten!\n", 3313 engine->name); 3314 } 3315 3316 static void execlists_context_unpin(struct intel_context *ce) 3317 { 3318 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3319 ce->engine); 3320 } 3321 3322 static void execlists_context_post_unpin(struct intel_context *ce) 3323 { 3324 i915_gem_object_unpin_map(ce->state->obj); 3325 } 3326 3327 static u32 * 3328 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3329 { 3330 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3331 MI_SRM_LRM_GLOBAL_GTT | 3332 MI_LRI_LRM_CS_MMIO; 3333 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3334 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3335 CTX_TIMESTAMP * sizeof(u32); 3336 *cs++ = 0; 3337 3338 *cs++ = MI_LOAD_REGISTER_REG | 3339 MI_LRR_SOURCE_CS_MMIO | 3340 MI_LRI_LRM_CS_MMIO; 3341 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3342 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3343 3344 *cs++ = MI_LOAD_REGISTER_REG | 3345 MI_LRR_SOURCE_CS_MMIO | 3346 MI_LRI_LRM_CS_MMIO; 3347 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3348 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3349 3350 return cs; 3351 } 3352 3353 static u32 * 3354 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3355 { 3356 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3357 3358 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3359 MI_SRM_LRM_GLOBAL_GTT | 3360 MI_LRI_LRM_CS_MMIO; 3361 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3362 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3363 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3364 *cs++ = 0; 3365 3366 return cs; 3367 } 3368 3369 static u32 * 3370 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3371 { 3372 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3373 3374 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3375 
MI_SRM_LRM_GLOBAL_GTT | 3376 MI_LRI_LRM_CS_MMIO; 3377 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3378 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3379 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3380 *cs++ = 0; 3381 3382 *cs++ = MI_LOAD_REGISTER_REG | 3383 MI_LRR_SOURCE_CS_MMIO | 3384 MI_LRI_LRM_CS_MMIO; 3385 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3386 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3387 3388 return cs; 3389 } 3390 3391 static u32 * 3392 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3393 { 3394 cs = gen12_emit_timestamp_wa(ce, cs); 3395 cs = gen12_emit_cmd_buf_wa(ce, cs); 3396 cs = gen12_emit_restore_scratch(ce, cs); 3397 3398 return cs; 3399 } 3400 3401 static u32 * 3402 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3403 { 3404 cs = gen12_emit_timestamp_wa(ce, cs); 3405 cs = gen12_emit_restore_scratch(ce, cs); 3406 3407 return cs; 3408 } 3409 3410 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3411 { 3412 return PAGE_SIZE * ce->wa_bb_page; 3413 } 3414 3415 static u32 *context_indirect_bb(const struct intel_context *ce) 3416 { 3417 void *ptr; 3418 3419 GEM_BUG_ON(!ce->wa_bb_page); 3420 3421 ptr = ce->lrc_reg_state; 3422 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3423 ptr += context_wa_bb_offset(ce); 3424 3425 return ptr; 3426 } 3427 3428 static void 3429 setup_indirect_ctx_bb(const struct intel_context *ce, 3430 const struct intel_engine_cs *engine, 3431 u32 *(*emit)(const struct intel_context *, u32 *)) 3432 { 3433 u32 * const start = context_indirect_bb(ce); 3434 u32 *cs; 3435 3436 cs = emit(ce, start); 3437 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3438 while ((unsigned long)cs % CACHELINE_BYTES) 3439 *cs++ = MI_NOOP; 3440 3441 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3442 i915_ggtt_offset(ce->state) + 3443 context_wa_bb_offset(ce), 3444 (cs - start) * sizeof(*cs)); 3445 } 3446 3447 static void 3448 __execlists_update_reg_state(const struct intel_context *ce, 3449 const struct intel_engine_cs *engine, 3450 u32 head) 3451 { 3452 struct intel_ring *ring = ce->ring; 3453 u32 *regs = ce->lrc_reg_state; 3454 3455 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3456 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3457 3458 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3459 regs[CTX_RING_HEAD] = head; 3460 regs[CTX_RING_TAIL] = ring->tail; 3461 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3462 3463 /* RPCS */ 3464 if (engine->class == RENDER_CLASS) { 3465 regs[CTX_R_PWR_CLK_STATE] = 3466 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3467 3468 i915_oa_init_reg_state(ce, engine); 3469 } 3470 3471 if (ce->wa_bb_page) { 3472 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3473 3474 fn = gen12_emit_indirect_ctx_xcs; 3475 if (ce->engine->class == RENDER_CLASS) 3476 fn = gen12_emit_indirect_ctx_rcs; 3477 3478 /* Mutually exclusive wrt to global indirect bb */ 3479 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3480 setup_indirect_ctx_bb(ce, engine, fn); 3481 } 3482 } 3483 3484 static int 3485 execlists_context_pre_pin(struct intel_context *ce, 3486 struct i915_gem_ww_ctx *ww, void **vaddr) 3487 { 3488 GEM_BUG_ON(!ce->state); 3489 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3490 3491 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3492 i915_coherent_map_type(ce->engine->i915) | 3493 I915_MAP_OVERRIDE); 3494 3495 return PTR_ERR_OR_ZERO(*vaddr); 3496 } 3497 3498 static int 3499 
__execlists_context_pin(struct intel_context *ce, 3500 struct intel_engine_cs *engine, 3501 void *vaddr) 3502 { 3503 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3504 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3505 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3506 3507 return 0; 3508 } 3509 3510 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3511 { 3512 return __execlists_context_pin(ce, ce->engine, vaddr); 3513 } 3514 3515 static int execlists_context_alloc(struct intel_context *ce) 3516 { 3517 return __execlists_context_alloc(ce, ce->engine); 3518 } 3519 3520 static void execlists_context_reset(struct intel_context *ce) 3521 { 3522 CE_TRACE(ce, "reset\n"); 3523 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3524 3525 intel_ring_reset(ce->ring, ce->ring->emit); 3526 3527 /* Scrub away the garbage */ 3528 execlists_init_reg_state(ce->lrc_reg_state, 3529 ce, ce->engine, ce->ring, true); 3530 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3531 3532 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3533 } 3534 3535 static const struct intel_context_ops execlists_context_ops = { 3536 .alloc = execlists_context_alloc, 3537 3538 .pre_pin = execlists_context_pre_pin, 3539 .pin = execlists_context_pin, 3540 .unpin = execlists_context_unpin, 3541 .post_unpin = execlists_context_post_unpin, 3542 3543 .enter = intel_context_enter_engine, 3544 .exit = intel_context_exit_engine, 3545 3546 .reset = execlists_context_reset, 3547 .destroy = execlists_context_destroy, 3548 }; 3549 3550 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3551 { 3552 u32 *cs; 3553 3554 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3555 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3556 return 0; 3557 3558 cs = intel_ring_begin(rq, 6); 3559 if (IS_ERR(cs)) 3560 return PTR_ERR(cs); 3561 3562 /* 3563 * Check if we have been preempted before we even get started. 3564 * 3565 * After this point i915_request_started() reports true, even if 3566 * we get preempted and so are no longer running. 3567 */ 3568 *cs++ = MI_ARB_CHECK; 3569 *cs++ = MI_NOOP; 3570 3571 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3572 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3573 *cs++ = 0; 3574 *cs++ = rq->fence.seqno - 1; 3575 3576 intel_ring_advance(rq, cs); 3577 3578 /* Record the updated position of the request's payload */ 3579 rq->infix = intel_ring_offset(rq, cs); 3580 3581 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3582 3583 return 0; 3584 } 3585 3586 static int emit_pdps(struct i915_request *rq) 3587 { 3588 const struct intel_engine_cs * const engine = rq->engine; 3589 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3590 int err, i; 3591 u32 *cs; 3592 3593 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3594 3595 /* 3596 * Beware ye of the dragons, this sequence is magic! 3597 * 3598 * Small changes to this sequence can cause anything from 3599 * GPU hangs to forcewake errors and machine lockups! 3600 */ 3601 3602 /* Flush any residual operations from the context load */ 3603 err = engine->emit_flush(rq, EMIT_FLUSH); 3604 if (err) 3605 return err; 3606 3607 /* Magic required to prevent forcewake errors! 
	 */
	err = engine->emit_flush(rq, EMIT_INVALIDATE);
	if (err)
		return err;

	cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Ensure the LRI have landed before we invalidate & continue */
	*cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
	for (i = GEN8_3LVL_PDPES; i--; ) {
		const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
		u32 base = engine->mmio_base;

		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
		*cs++ = upper_32_bits(pd_daddr);
		*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
		*cs++ = lower_32_bits(pd_daddr);
	}
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

static int execlists_request_alloc(struct i915_request *request)
{
	int ret;

	GEM_BUG_ON(!intel_context_is_pinned(request->context));

	/*
	 * Flush enough space to reduce the likelihood of waiting after
	 * we start building the request - in which case we will just
	 * have to repeat work.
	 */
	request->reserved_space += EXECLISTS_REQUEST_SIZE;

	/*
	 * Note that after this point, we have committed to using
	 * this request as it is being used to both track the
	 * state of engine initialisation and liveness of the
	 * golden renderstate above. Think twice before you try
	 * to cancel/unwind this request now.
	 */

	if (!i915_vm_is_4lvl(request->context->vm)) {
		ret = emit_pdps(request);
		if (ret)
			return ret;
	}

	/* Unconditionally invalidate GPU caches and TLBs. */
	ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
	if (ret)
		return ret;

	request->reserved_space -= EXECLISTS_REQUEST_SIZE;
	return 0;
}

/*
 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
 * PIPE_CONTROL instruction. This is required for the flush to happen correctly,
 * but there is a slight complication as this is applied in a WA batch where the
 * values are only initialized once, so we cannot take the register value at the
 * beginning and reuse it further; hence we save its value to memory, upload a
 * constant value with bit21 set and then we restore it back with the saved value.
 * To simplify the WA, a constant value is formed by using the default value
 * of this register. This shouldn't be a problem because we are only modifying
 * it for a short period and this batch is non-preemptible. We can of course
 * use additional instructions that read the actual value of the register
 * at that time and set our bit of interest, but that makes the WA complicated.
 *
 * This WA is also required for Gen9, so extracting it as a function avoids
 * code duplication.
 */
static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{
	/* NB no one else is allowed to scribble over scratch + 256!
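	 *
	 * (The scratch slot in question is the one addressed via
	 * intel_gt_scratch_offset(gt, INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA)
	 * in the SRM/LRM pair below, which holds the saved value of
	 * GEN8_L3SQCREG4 for the duration of the batch.)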
	 */
	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	*batch++ = MI_LOAD_REGISTER_IMM(1);
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;

	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_DC_FLUSH_ENABLE,
				       0);

	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
	*batch++ = intel_gt_scratch_offset(engine->gt,
					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
	*batch++ = 0;

	return batch;
}

/*
 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
 * initialized at the beginning and shared across all contexts, but this field
 * helps us to have multiple batches at different offsets and select them based
 * on some criteria. At the moment this batch always starts at the beginning of
 * the page and at this point we don't have multiple wa_ctx batch buffers.
 *
 * The number of WAs applied is not known at the beginning; we use this field
 * to return the number of DWORDS written.
 *
 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
 * so it adds NOOPs as padding to make it cacheline aligned.
 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them
 * together make a complete batch buffer.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		batch = gen8_emit_flush_coherentl3_wa(engine, batch);

	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
	/* Actual scratch location is at 128 bytes offset */
	batch = gen8_emit_pipe_control(batch,
				       PIPE_CONTROL_FLUSH_L3 |
				       PIPE_CONTROL_STORE_DATA_INDEX |
				       PIPE_CONTROL_CS_STALL |
				       PIPE_CONTROL_QW_WRITE,
				       LRC_PPHWSP_SCRATCH_ADDR);

	*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Pad to end of cacheline */
	while ((unsigned long)batch % CACHELINE_BYTES)
		*batch++ = MI_NOOP;

	/*
	 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
	 * execution depends on the length specified in terms of cache lines
	 * in the register CTX_RCS_INDIRECT_CTX
	 */

	return batch;
}

struct lri {
	i915_reg_t reg;
	u32 value;
};

static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	GEM_BUG_ON(!count || count > 63);

	*batch++ = MI_LOAD_REGISTER_IMM(count);
	do {
		*batch++ = i915_mmio_reg_offset(lri->reg);
		*batch++ = lri->value;
	} while (lri++, --count);
	*batch++ = MI_NOOP;

	return batch;
}

static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri lri[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			COMMON_SLICE_CHICKEN2,
			__MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
				       0),
		},

		/* BSpec: 11391 */
		{
			FF_SLICE_CHICKEN,
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3795 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3796 }, 3797 3798 /* BSpec: 11299 */ 3799 { 3800 _3D_CHICKEN3, 3801 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3802 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3803 } 3804 }; 3805 3806 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3807 3808 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3809 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3810 3811 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3812 batch = gen8_emit_pipe_control(batch, 3813 PIPE_CONTROL_FLUSH_L3 | 3814 PIPE_CONTROL_STORE_DATA_INDEX | 3815 PIPE_CONTROL_CS_STALL | 3816 PIPE_CONTROL_QW_WRITE, 3817 LRC_PPHWSP_SCRATCH_ADDR); 3818 3819 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3820 3821 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3822 if (HAS_POOLED_EU(engine->i915)) { 3823 /* 3824 * EU pool configuration is setup along with golden context 3825 * during context initialization. This value depends on 3826 * device type (2x6 or 3x6) and needs to be updated based 3827 * on which subslice is disabled especially for 2x6 3828 * devices, however it is safe to load default 3829 * configuration of 3x6 device instead of masking off 3830 * corresponding bits because HW ignores bits of a disabled 3831 * subslice and drops down to appropriate config. Please 3832 * see render_state_setup() in i915_gem_render_state.c for 3833 * possible configurations, to avoid duplication they are 3834 * not shown here again. 3835 */ 3836 *batch++ = GEN9_MEDIA_POOL_STATE; 3837 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3838 *batch++ = 0x00777000; 3839 *batch++ = 0; 3840 *batch++ = 0; 3841 *batch++ = 0; 3842 } 3843 3844 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3845 3846 /* Pad to end of cacheline */ 3847 while ((unsigned long)batch % CACHELINE_BYTES) 3848 *batch++ = MI_NOOP; 3849 3850 return batch; 3851 } 3852 3853 static u32 * 3854 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3855 { 3856 int i; 3857 3858 /* 3859 * WaPipeControlBefore3DStateSamplePattern: cnl 3860 * 3861 * Ensure the engine is idle prior to programming a 3862 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3863 */ 3864 batch = gen8_emit_pipe_control(batch, 3865 PIPE_CONTROL_CS_STALL, 3866 0); 3867 /* 3868 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3869 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3870 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3871 * confusing. Since gen8_emit_pipe_control() already advances the 3872 * batch by 6 dwords, we advance the other 10 here, completing a 3873 * cacheline. It's not clear if the workaround requires this padding 3874 * before other commands, or if it's just the regular padding we would 3875 * already have for the workaround bb, so leave it here for now. 
3876 */ 3877 for (i = 0; i < 10; i++) 3878 *batch++ = MI_NOOP; 3879 3880 /* Pad to end of cacheline */ 3881 while ((unsigned long)batch % CACHELINE_BYTES) 3882 *batch++ = MI_NOOP; 3883 3884 return batch; 3885 } 3886 3887 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3888 3889 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3890 { 3891 struct drm_i915_gem_object *obj; 3892 struct i915_vma *vma; 3893 int err; 3894 3895 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3896 if (IS_ERR(obj)) 3897 return PTR_ERR(obj); 3898 3899 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3900 if (IS_ERR(vma)) { 3901 err = PTR_ERR(vma); 3902 goto err; 3903 } 3904 3905 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3906 if (err) 3907 goto err; 3908 3909 engine->wa_ctx.vma = vma; 3910 return 0; 3911 3912 err: 3913 i915_gem_object_put(obj); 3914 return err; 3915 } 3916 3917 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3918 { 3919 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3920 } 3921 3922 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3923 3924 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3925 { 3926 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3927 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3928 &wa_ctx->per_ctx }; 3929 wa_bb_func_t wa_bb_fn[2]; 3930 void *batch, *batch_ptr; 3931 unsigned int i; 3932 int ret; 3933 3934 if (engine->class != RENDER_CLASS) 3935 return 0; 3936 3937 switch (INTEL_GEN(engine->i915)) { 3938 case 12: 3939 case 11: 3940 return 0; 3941 case 10: 3942 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3943 wa_bb_fn[1] = NULL; 3944 break; 3945 case 9: 3946 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3947 wa_bb_fn[1] = NULL; 3948 break; 3949 case 8: 3950 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3951 wa_bb_fn[1] = NULL; 3952 break; 3953 default: 3954 MISSING_CASE(INTEL_GEN(engine->i915)); 3955 return 0; 3956 } 3957 3958 ret = lrc_setup_wa_ctx(engine); 3959 if (ret) { 3960 drm_dbg(&engine->i915->drm, 3961 "Failed to setup context WA page: %d\n", ret); 3962 return ret; 3963 } 3964 3965 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 3966 3967 /* 3968 * Emit the two workaround batch buffers, recording the offset from the 3969 * start of the workaround batch buffer object for each and their 3970 * respective sizes. 3971 */ 3972 batch_ptr = batch; 3973 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3974 wa_bb[i]->offset = batch_ptr - batch; 3975 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3976 CACHELINE_BYTES))) { 3977 ret = -EINVAL; 3978 break; 3979 } 3980 if (wa_bb_fn[i]) 3981 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3982 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3983 } 3984 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3985 3986 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 3987 __i915_gem_object_release_map(wa_ctx->vma->obj); 3988 if (ret) 3989 lrc_destroy_wa_ctx(engine); 3990 3991 return ret; 3992 } 3993 3994 static void reset_csb_pointers(struct intel_engine_cs *engine) 3995 { 3996 struct intel_engine_execlists * const execlists = &engine->execlists; 3997 const unsigned int reset_value = execlists->csb_size - 1; 3998 3999 ring_set_paused(engine, 0); 4000 4001 /* 4002 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4003 * Bludgeon them with a mmio update to be sure. 
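 *
 * For reference, the write below packs an update mask into the upper
 * half and forces both the read and write pointers to the last entry.
 * Assuming the 12-entry Gen11+ CSB, reset_value is 11 and the value
 * written is 0xffff0b0b; with the 6-entry Gen8 layout it would be
 * 0xffff0505.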
4004 */ 4005 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4006 0xffff << 16 | reset_value << 8 | reset_value); 4007 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4008 4009 /* 4010 * After a reset, the HW starts writing into CSB entry [0]. We 4011 * therefore have to set our HEAD pointer back one entry so that 4012 * the *first* entry we check is entry 0. To complicate this further, 4013 * as we don't wait for the first interrupt after reset, we have to 4014 * fake the HW write to point back to the last entry so that our 4015 * inline comparison of our cached head position against the last HW 4016 * write works even before the first interrupt. 4017 */ 4018 execlists->csb_head = reset_value; 4019 WRITE_ONCE(*execlists->csb_write, reset_value); 4020 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4021 4022 /* Check that the GPU does indeed update the CSB entries! */ 4023 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4024 invalidate_csb_entries(&execlists->csb_status[0], 4025 &execlists->csb_status[reset_value]); 4026 4027 /* Once more for luck and our trusty paranoia */ 4028 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4029 0xffff << 16 | reset_value << 8 | reset_value); 4030 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4031 4032 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4033 } 4034 4035 static void execlists_sanitize(struct intel_engine_cs *engine) 4036 { 4037 /* 4038 * Poison residual state on resume, in case the suspend didn't! 4039 * 4040 * We have to assume that across suspend/resume (or other loss 4041 * of control) that the contents of our pinned buffers has been 4042 * lost, replaced by garbage. Since this doesn't always happen, 4043 * let's poison such state so that we more quickly spot when 4044 * we falsely assume it has been preserved. 4045 */ 4046 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4047 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4048 4049 reset_csb_pointers(engine); 4050 4051 /* 4052 * The kernel_context HWSP is stored in the status_page. As above, 4053 * that may be lost on resume/initialisation, and so we need to 4054 * reset the value in the HWSP. 4055 */ 4056 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4057 4058 /* And scrub the dirty cachelines for the HWSP */ 4059 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4060 } 4061 4062 static void enable_error_interrupt(struct intel_engine_cs *engine) 4063 { 4064 u32 status; 4065 4066 engine->execlists.error_interrupt = 0; 4067 ENGINE_WRITE(engine, RING_EMR, ~0u); 4068 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4069 4070 status = ENGINE_READ(engine, RING_ESR); 4071 if (unlikely(status)) { 4072 drm_err(&engine->i915->drm, 4073 "engine '%s' resumed still in error: %08x\n", 4074 engine->name, status); 4075 __intel_gt_reset(engine->gt, engine->mask); 4076 } 4077 4078 /* 4079 * On current gen8+, we have 2 signals to play with 4080 * 4081 * - I915_ERROR_INSTUCTION (bit 0) 4082 * 4083 * Generate an error if the command parser encounters an invalid 4084 * instruction 4085 * 4086 * This is a fatal error. 4087 * 4088 * - CP_PRIV (bit 2) 4089 * 4090 * Generate an error on privilege violation (where the CP replaces 4091 * the instruction with a no-op). This also fires for writes into 4092 * read-only scratch pages. 4093 * 4094 * This is a non-fatal error, parsing continues. 
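 *
 *   (As a worked example of the EMR write at the end of this function:
 *    with the instruction error being bit 0, ~I915_ERROR_INSTRUCTION is
 *    0xfffffffe, i.e. everything except bit 0 stays masked, including
 *    CP_PRIV.)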
4095 * 4096 * * there are a few others defined for odd HW that we do not use 4097 * 4098 * Since CP_PRIV fires for cases where we have chosen to ignore the 4099 * error (as the HW is validating and suppressing the mistakes), we 4100 * only unmask the instruction error bit. 4101 */ 4102 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4103 } 4104 4105 static void enable_execlists(struct intel_engine_cs *engine) 4106 { 4107 u32 mode; 4108 4109 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4110 4111 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4112 4113 if (INTEL_GEN(engine->i915) >= 11) 4114 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4115 else 4116 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4117 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4118 4119 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4120 4121 ENGINE_WRITE_FW(engine, 4122 RING_HWS_PGA, 4123 i915_ggtt_offset(engine->status_page.vma)); 4124 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4125 4126 enable_error_interrupt(engine); 4127 4128 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4129 } 4130 4131 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4132 { 4133 bool unexpected = false; 4134 4135 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4136 drm_dbg(&engine->i915->drm, 4137 "STOP_RING still set in RING_MI_MODE\n"); 4138 unexpected = true; 4139 } 4140 4141 return unexpected; 4142 } 4143 4144 static int execlists_resume(struct intel_engine_cs *engine) 4145 { 4146 intel_mocs_init_engine(engine); 4147 4148 intel_breadcrumbs_reset(engine->breadcrumbs); 4149 4150 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4151 struct drm_printer p = drm_debug_printer(__func__); 4152 4153 intel_engine_dump(engine, &p, NULL); 4154 } 4155 4156 enable_execlists(engine); 4157 4158 return 0; 4159 } 4160 4161 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4162 { 4163 struct intel_engine_execlists * const execlists = &engine->execlists; 4164 unsigned long flags; 4165 4166 ENGINE_TRACE(engine, "depth<-%d\n", 4167 atomic_read(&execlists->tasklet.count)); 4168 4169 /* 4170 * Prevent request submission to the hardware until we have 4171 * completed the reset in i915_gem_reset_finish(). If a request 4172 * is completed by one engine, it may then queue a request 4173 * to a second via its execlists->tasklet *just* as we are 4174 * calling engine->resume() and also writing the ELSP. 4175 * Turning off the execlists->tasklet until the reset is over 4176 * prevents the race. 4177 */ 4178 __tasklet_disable_sync_once(&execlists->tasklet); 4179 GEM_BUG_ON(!reset_in_progress(execlists)); 4180 4181 /* And flush any current direct submission. */ 4182 spin_lock_irqsave(&engine->active.lock, flags); 4183 spin_unlock_irqrestore(&engine->active.lock, flags); 4184 4185 /* 4186 * We stop engines, otherwise we might get failed reset and a 4187 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4188 * from system hang if batchbuffer is progressing when 4189 * the reset is issued, regardless of READY_TO_RESET ack. 4190 * Thus assume it is best to stop engines on all gens 4191 * where we have a gpu reset. 
4192 * 4193 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4194 * 4195 * FIXME: Wa for more modern gens needs to be validated 4196 */ 4197 ring_set_paused(engine, 1); 4198 intel_engine_stop_cs(engine); 4199 4200 engine->execlists.reset_ccid = active_ccid(engine); 4201 } 4202 4203 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4204 { 4205 int x; 4206 4207 x = lrc_ring_mi_mode(engine); 4208 if (x != -1) { 4209 regs[x + 1] &= ~STOP_RING; 4210 regs[x + 1] |= STOP_RING << 16; 4211 } 4212 } 4213 4214 static void __execlists_reset_reg_state(const struct intel_context *ce, 4215 const struct intel_engine_cs *engine) 4216 { 4217 u32 *regs = ce->lrc_reg_state; 4218 4219 __reset_stop_ring(regs, engine); 4220 } 4221 4222 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4223 { 4224 struct intel_engine_execlists * const execlists = &engine->execlists; 4225 struct intel_context *ce; 4226 struct i915_request *rq; 4227 u32 head; 4228 4229 mb(); /* paranoia: read the CSB pointers from after the reset */ 4230 clflush(execlists->csb_write); 4231 mb(); 4232 4233 process_csb(engine); /* drain preemption events */ 4234 4235 /* Following the reset, we need to reload the CSB read/write pointers */ 4236 reset_csb_pointers(engine); 4237 4238 /* 4239 * Save the currently executing context, even if we completed 4240 * its request, it was still running at the time of the 4241 * reset and will have been clobbered. 4242 */ 4243 rq = active_context(engine, engine->execlists.reset_ccid); 4244 if (!rq) 4245 goto unwind; 4246 4247 ce = rq->context; 4248 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4249 4250 if (i915_request_completed(rq)) { 4251 /* Idle context; tidy up the ring so we can restart afresh */ 4252 head = intel_ring_wrap(ce->ring, rq->tail); 4253 goto out_replay; 4254 } 4255 4256 /* We still have requests in-flight; the engine should be active */ 4257 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4258 4259 /* Context has requests still in-flight; it should not be idle! */ 4260 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4261 4262 rq = active_request(ce->timeline, rq); 4263 head = intel_ring_wrap(ce->ring, rq->head); 4264 GEM_BUG_ON(head == ce->ring->tail); 4265 4266 /* 4267 * If this request hasn't started yet, e.g. it is waiting on a 4268 * semaphore, we need to avoid skipping the request or else we 4269 * break the signaling chain. However, if the context is corrupt 4270 * the request will not restart and we will be stuck with a wedged 4271 * device. It is quite often the case that if we issue a reset 4272 * while the GPU is loading the context image, that the context 4273 * image becomes corrupt. 4274 * 4275 * Otherwise, if we have not started yet, the request should replay 4276 * perfectly and we do not need to flag the result as being erroneous. 4277 */ 4278 if (!i915_request_started(rq)) 4279 goto out_replay; 4280 4281 /* 4282 * If the request was innocent, we leave the request in the ELSP 4283 * and will try to replay it on restarting. The context image may 4284 * have been corrupted by the reset, in which case we may have 4285 * to service a new GPU hang, but more likely we can continue on 4286 * without impact. 4287 * 4288 * If the request was guilty, we presume the context is corrupt 4289 * and have to at least restore the RING register in the context 4290 * image back to the expected values to skip over the guilty request. 
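 *
 * To summarise the replay decision taken above: a request that had
 * already completed replays from its tail (an idle ring), a request
 * that had not yet started replays untouched from its head, and only
 * a request that was actually executing is passed to
 * __i915_request_reset() below to be judged innocent or guilty.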
4291 */ 4292 __i915_request_reset(rq, stalled); 4293 4294 /* 4295 * We want a simple context + ring to execute the breadcrumb update. 4296 * We cannot rely on the context being intact across the GPU hang, 4297 * so clear it and rebuild just what we need for the breadcrumb. 4298 * All pending requests for this context will be zapped, and any 4299 * future request will be after userspace has had the opportunity 4300 * to recreate its own state. 4301 */ 4302 out_replay: 4303 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4304 head, ce->ring->tail); 4305 __execlists_reset_reg_state(ce, engine); 4306 __execlists_update_reg_state(ce, engine, head); 4307 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4308 4309 unwind: 4310 /* Push back any incomplete requests for replay after the reset. */ 4311 cancel_port_requests(execlists); 4312 __unwind_incomplete_requests(engine); 4313 } 4314 4315 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4316 { 4317 unsigned long flags; 4318 4319 ENGINE_TRACE(engine, "\n"); 4320 4321 spin_lock_irqsave(&engine->active.lock, flags); 4322 4323 __execlists_reset(engine, stalled); 4324 4325 spin_unlock_irqrestore(&engine->active.lock, flags); 4326 } 4327 4328 static void nop_submission_tasklet(unsigned long data) 4329 { 4330 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4331 4332 /* The driver is wedged; don't process any more events. */ 4333 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4334 } 4335 4336 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4337 { 4338 struct intel_engine_execlists * const execlists = &engine->execlists; 4339 struct i915_request *rq, *rn; 4340 struct rb_node *rb; 4341 unsigned long flags; 4342 4343 ENGINE_TRACE(engine, "\n"); 4344 4345 /* 4346 * Before we call engine->cancel_requests(), we should have exclusive 4347 * access to the submission state. This is arranged for us by the 4348 * caller disabling the interrupt generation, the tasklet and other 4349 * threads that may then access the same state, giving us a free hand 4350 * to reset state. However, we still need to let lockdep be aware that 4351 * we know this state may be accessed in hardirq context, so we 4352 * disable the irq around this manipulation and we want to keep 4353 * the spinlock focused on its duties and not accidentally conflate 4354 * coverage to the submission's irq state. (Similarly, although we 4355 * shouldn't need to disable irq around the manipulation of the 4356 * submission's irq state, we also wish to remind ourselves that 4357 * it is irq state.) 4358 */ 4359 spin_lock_irqsave(&engine->active.lock, flags); 4360 4361 __execlists_reset(engine, true); 4362 4363 /* Mark all executing requests as skipped. */ 4364 list_for_each_entry(rq, &engine->active.requests, sched.link) 4365 mark_eio(rq); 4366 4367 /* Flush the queued requests to the timeline list (for retiring). 
*/ 4368 while ((rb = rb_first_cached(&execlists->queue))) { 4369 struct i915_priolist *p = to_priolist(rb); 4370 int i; 4371 4372 priolist_for_each_request_consume(rq, rn, p, i) { 4373 mark_eio(rq); 4374 __i915_request_submit(rq); 4375 } 4376 4377 rb_erase_cached(&p->node, &execlists->queue); 4378 i915_priolist_free(p); 4379 } 4380 4381 /* On-hold requests will be flushed to timeline upon their release */ 4382 list_for_each_entry(rq, &engine->active.hold, sched.link) 4383 mark_eio(rq); 4384 4385 /* Cancel all attached virtual engines */ 4386 while ((rb = rb_first_cached(&execlists->virtual))) { 4387 struct virtual_engine *ve = 4388 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4389 4390 rb_erase_cached(rb, &execlists->virtual); 4391 RB_CLEAR_NODE(rb); 4392 4393 spin_lock(&ve->base.active.lock); 4394 rq = fetch_and_zero(&ve->request); 4395 if (rq) { 4396 mark_eio(rq); 4397 4398 rq->engine = engine; 4399 __i915_request_submit(rq); 4400 i915_request_put(rq); 4401 4402 ve->base.execlists.queue_priority_hint = INT_MIN; 4403 } 4404 spin_unlock(&ve->base.active.lock); 4405 } 4406 4407 /* Remaining _unready_ requests will be nop'ed when submitted */ 4408 4409 execlists->queue_priority_hint = INT_MIN; 4410 execlists->queue = RB_ROOT_CACHED; 4411 4412 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4413 execlists->tasklet.func = nop_submission_tasklet; 4414 4415 spin_unlock_irqrestore(&engine->active.lock, flags); 4416 } 4417 4418 static void execlists_reset_finish(struct intel_engine_cs *engine) 4419 { 4420 struct intel_engine_execlists * const execlists = &engine->execlists; 4421 4422 /* 4423 * After a GPU reset, we may have requests to replay. Do so now while 4424 * we still have the forcewake to be sure that the GPU is not allowed 4425 * to sleep before we restart and reload a context. 4426 */ 4427 GEM_BUG_ON(!reset_in_progress(execlists)); 4428 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4429 execlists->tasklet.func(execlists->tasklet.data); 4430 4431 if (__tasklet_enable(&execlists->tasklet)) 4432 /* And kick in case we missed a new request submission. */ 4433 tasklet_hi_schedule(&execlists->tasklet); 4434 ENGINE_TRACE(engine, "depth->%d\n", 4435 atomic_read(&execlists->tasklet.count)); 4436 } 4437 4438 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4439 u64 offset, u32 len, 4440 const unsigned int flags) 4441 { 4442 u32 *cs; 4443 4444 cs = intel_ring_begin(rq, 4); 4445 if (IS_ERR(cs)) 4446 return PTR_ERR(cs); 4447 4448 /* 4449 * WaDisableCtxRestoreArbitration:bdw,chv 4450 * 4451 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4452 * particular all the gen that do not need the w/a at all!), if we 4453 * took care to make sure that on every switch into this context 4454 * (both ordinary and for preemption) that arbitrartion was enabled 4455 * we would be fine. However, for gen8 there is another w/a that 4456 * requires us to not preempt inside GPGPU execution, so we keep 4457 * arbitration disabled for gen8 batches. Arbitration will be 4458 * re-enabled before we close the request 4459 * (engine->emit_fini_breadcrumb). 4460 */ 4461 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4462 4463 /* FIXME(BDW+): Address space and security selectors. */ 4464 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4465 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4466 *cs++ = lower_32_bits(offset); 4467 *cs++ = upper_32_bits(offset); 4468 4469 intel_ring_advance(rq, cs); 4470 4471 return 0; 4472 } 4473 4474 static int gen8_emit_bb_start(struct i915_request *rq, 4475 u64 offset, u32 len, 4476 const unsigned int flags) 4477 { 4478 u32 *cs; 4479 4480 cs = intel_ring_begin(rq, 6); 4481 if (IS_ERR(cs)) 4482 return PTR_ERR(cs); 4483 4484 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4485 4486 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4487 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4488 *cs++ = lower_32_bits(offset); 4489 *cs++ = upper_32_bits(offset); 4490 4491 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4492 *cs++ = MI_NOOP; 4493 4494 intel_ring_advance(rq, cs); 4495 4496 return 0; 4497 } 4498 4499 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4500 { 4501 ENGINE_WRITE(engine, RING_IMR, 4502 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4503 ENGINE_POSTING_READ(engine, RING_IMR); 4504 } 4505 4506 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4507 { 4508 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4509 } 4510 4511 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4512 { 4513 u32 cmd, *cs; 4514 4515 cs = intel_ring_begin(request, 4); 4516 if (IS_ERR(cs)) 4517 return PTR_ERR(cs); 4518 4519 cmd = MI_FLUSH_DW + 1; 4520 4521 /* We always require a command barrier so that subsequent 4522 * commands, such as breadcrumb interrupts, are strictly ordered 4523 * wrt the contents of the write cache being flushed to memory 4524 * (and thus being coherent from the CPU). 4525 */ 4526 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4527 4528 if (mode & EMIT_INVALIDATE) { 4529 cmd |= MI_INVALIDATE_TLB; 4530 if (request->engine->class == VIDEO_DECODE_CLASS) 4531 cmd |= MI_INVALIDATE_BSD; 4532 } 4533 4534 *cs++ = cmd; 4535 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4536 *cs++ = 0; /* upper addr */ 4537 *cs++ = 0; /* value */ 4538 intel_ring_advance(request, cs); 4539 4540 return 0; 4541 } 4542 4543 static int gen8_emit_flush_render(struct i915_request *request, 4544 u32 mode) 4545 { 4546 bool vf_flush_wa = false, dc_flush_wa = false; 4547 u32 *cs, flags = 0; 4548 int len; 4549 4550 flags |= PIPE_CONTROL_CS_STALL; 4551 4552 if (mode & EMIT_FLUSH) { 4553 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4554 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4555 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4556 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4557 } 4558 4559 if (mode & EMIT_INVALIDATE) { 4560 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4561 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4562 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4563 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4564 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4565 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4566 flags |= PIPE_CONTROL_QW_WRITE; 4567 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4568 4569 /* 4570 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4571 * pipe control. 
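 *
 * (For the space bookkeeping that follows: the main PIPE_CONTROL is 6
 * dwords, the gen9 NULL PIPE_CONTROL adds another 6, and the kbl GAM
 * hang workaround adds 12 more, a DC-flush PIPE_CONTROL before and a
 * CS-stall PIPE_CONTROL after the main one.)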
4572 */ 4573 if (IS_GEN(request->engine->i915, 9)) 4574 vf_flush_wa = true; 4575 4576 /* WaForGAMHang:kbl */ 4577 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4578 dc_flush_wa = true; 4579 } 4580 4581 len = 6; 4582 4583 if (vf_flush_wa) 4584 len += 6; 4585 4586 if (dc_flush_wa) 4587 len += 12; 4588 4589 cs = intel_ring_begin(request, len); 4590 if (IS_ERR(cs)) 4591 return PTR_ERR(cs); 4592 4593 if (vf_flush_wa) 4594 cs = gen8_emit_pipe_control(cs, 0, 0); 4595 4596 if (dc_flush_wa) 4597 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4598 0); 4599 4600 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4601 4602 if (dc_flush_wa) 4603 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4604 4605 intel_ring_advance(request, cs); 4606 4607 return 0; 4608 } 4609 4610 static int gen11_emit_flush_render(struct i915_request *request, 4611 u32 mode) 4612 { 4613 if (mode & EMIT_FLUSH) { 4614 u32 *cs; 4615 u32 flags = 0; 4616 4617 flags |= PIPE_CONTROL_CS_STALL; 4618 4619 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4620 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4621 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4622 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4623 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4624 flags |= PIPE_CONTROL_QW_WRITE; 4625 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4626 4627 cs = intel_ring_begin(request, 6); 4628 if (IS_ERR(cs)) 4629 return PTR_ERR(cs); 4630 4631 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4632 intel_ring_advance(request, cs); 4633 } 4634 4635 if (mode & EMIT_INVALIDATE) { 4636 u32 *cs; 4637 u32 flags = 0; 4638 4639 flags |= PIPE_CONTROL_CS_STALL; 4640 4641 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4642 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4643 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4644 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4645 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4646 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4647 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4648 flags |= PIPE_CONTROL_QW_WRITE; 4649 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4650 4651 cs = intel_ring_begin(request, 6); 4652 if (IS_ERR(cs)) 4653 return PTR_ERR(cs); 4654 4655 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4656 intel_ring_advance(request, cs); 4657 } 4658 4659 return 0; 4660 } 4661 4662 static u32 preparser_disable(bool state) 4663 { 4664 return MI_ARB_CHECK | 1 << 8 | state; 4665 } 4666 4667 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4668 { 4669 static const i915_reg_t vd[] = { 4670 GEN12_VD0_AUX_NV, 4671 GEN12_VD1_AUX_NV, 4672 GEN12_VD2_AUX_NV, 4673 GEN12_VD3_AUX_NV, 4674 }; 4675 4676 static const i915_reg_t ve[] = { 4677 GEN12_VE0_AUX_NV, 4678 GEN12_VE1_AUX_NV, 4679 }; 4680 4681 if (engine->class == VIDEO_DECODE_CLASS) 4682 return vd[engine->instance]; 4683 4684 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4685 return ve[engine->instance]; 4686 4687 GEM_BUG_ON("unknown aux_inv_reg\n"); 4688 4689 return INVALID_MMIO_REG; 4690 } 4691 4692 static u32 * 4693 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4694 { 4695 *cs++ = MI_LOAD_REGISTER_IMM(1); 4696 *cs++ = i915_mmio_reg_offset(inv_reg); 4697 *cs++ = AUX_INV; 4698 *cs++ = MI_NOOP; 4699 4700 return cs; 4701 } 4702 4703 static int gen12_emit_flush_render(struct i915_request *request, 4704 u32 mode) 4705 { 4706 if (mode & EMIT_FLUSH) { 4707 u32 flags = 0; 4708 u32 *cs; 4709 4710 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4711 flags |= PIPE_CONTROL_FLUSH_L3; 4712 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4713 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4714 /* Wa_1409600907:tgl */ 4715 flags |= PIPE_CONTROL_DEPTH_STALL; 4716 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4717 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4718 4719 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4720 flags |= PIPE_CONTROL_QW_WRITE; 4721 4722 flags |= PIPE_CONTROL_CS_STALL; 4723 4724 cs = intel_ring_begin(request, 6); 4725 if (IS_ERR(cs)) 4726 return PTR_ERR(cs); 4727 4728 cs = gen12_emit_pipe_control(cs, 4729 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4730 flags, LRC_PPHWSP_SCRATCH_ADDR); 4731 intel_ring_advance(request, cs); 4732 } 4733 4734 if (mode & EMIT_INVALIDATE) { 4735 u32 flags = 0; 4736 u32 *cs; 4737 4738 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4739 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4740 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4741 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4742 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4743 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4744 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4745 4746 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4747 flags |= PIPE_CONTROL_QW_WRITE; 4748 4749 flags |= PIPE_CONTROL_CS_STALL; 4750 4751 cs = intel_ring_begin(request, 8 + 4); 4752 if (IS_ERR(cs)) 4753 return PTR_ERR(cs); 4754 4755 /* 4756 * Prevent the pre-parser from skipping past the TLB 4757 * invalidate and loading a stale page for the batch 4758 * buffer / request payload. 4759 */ 4760 *cs++ = preparser_disable(true); 4761 4762 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4763 4764 /* hsdes: 1809175790 */ 4765 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4766 4767 *cs++ = preparser_disable(false); 4768 intel_ring_advance(request, cs); 4769 } 4770 4771 return 0; 4772 } 4773 4774 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4775 { 4776 intel_engine_mask_t aux_inv = 0; 4777 u32 cmd, *cs; 4778 4779 cmd = 4; 4780 if (mode & EMIT_INVALIDATE) 4781 cmd += 2; 4782 if (mode & EMIT_INVALIDATE) 4783 aux_inv = request->engine->mask & ~BIT(BCS0); 4784 if (aux_inv) 4785 cmd += 2 * hweight8(aux_inv) + 2; 4786 4787 cs = intel_ring_begin(request, cmd); 4788 if (IS_ERR(cs)) 4789 return PTR_ERR(cs); 4790 4791 if (mode & EMIT_INVALIDATE) 4792 *cs++ = preparser_disable(true); 4793 4794 cmd = MI_FLUSH_DW + 1; 4795 4796 /* We always require a command barrier so that subsequent 4797 * commands, such as breadcrumb interrupts, are strictly ordered 4798 * wrt the contents of the write cache being flushed to memory 4799 * (and thus being coherent from the CPU). 
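 *
 * As a rough worked example of the space reserved above: an
 * invalidating flush on a video decode engine with one AUX table to
 * invalidate takes 4 (MI_FLUSH_DW) + 2 (pre-parser disable/enable) +
 * 2 * 1 + 2 (LRI of the AUX_NV register plus a NOOP) = 10 dwords.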
4800 */ 4801 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4802 4803 if (mode & EMIT_INVALIDATE) { 4804 cmd |= MI_INVALIDATE_TLB; 4805 if (request->engine->class == VIDEO_DECODE_CLASS) 4806 cmd |= MI_INVALIDATE_BSD; 4807 } 4808 4809 *cs++ = cmd; 4810 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4811 *cs++ = 0; /* upper addr */ 4812 *cs++ = 0; /* value */ 4813 4814 if (aux_inv) { /* hsdes: 1809175790 */ 4815 struct intel_engine_cs *engine; 4816 unsigned int tmp; 4817 4818 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4819 for_each_engine_masked(engine, request->engine->gt, 4820 aux_inv, tmp) { 4821 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4822 *cs++ = AUX_INV; 4823 } 4824 *cs++ = MI_NOOP; 4825 } 4826 4827 if (mode & EMIT_INVALIDATE) 4828 *cs++ = preparser_disable(false); 4829 4830 intel_ring_advance(request, cs); 4831 4832 return 0; 4833 } 4834 4835 static void assert_request_valid(struct i915_request *rq) 4836 { 4837 struct intel_ring *ring __maybe_unused = rq->ring; 4838 4839 /* Can we unwind this request without appearing to go forwards? */ 4840 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4841 } 4842 4843 /* 4844 * Reserve space for 2 NOOPs at the end of each request to be 4845 * used as a workaround for not being allowed to do lite 4846 * restore with HEAD==TAIL (WaIdleLiteRestore). 4847 */ 4848 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4849 { 4850 /* Ensure there's always at least one preemption point per-request. */ 4851 *cs++ = MI_ARB_CHECK; 4852 *cs++ = MI_NOOP; 4853 request->wa_tail = intel_ring_offset(request, cs); 4854 4855 /* Check that entire request is less than half the ring */ 4856 assert_request_valid(request); 4857 4858 return cs; 4859 } 4860 4861 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4862 { 4863 *cs++ = MI_SEMAPHORE_WAIT | 4864 MI_SEMAPHORE_GLOBAL_GTT | 4865 MI_SEMAPHORE_POLL | 4866 MI_SEMAPHORE_SAD_EQ_SDD; 4867 *cs++ = 0; 4868 *cs++ = intel_hws_preempt_address(request->engine); 4869 *cs++ = 0; 4870 4871 return cs; 4872 } 4873 4874 static __always_inline u32* 4875 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4876 { 4877 *cs++ = MI_USER_INTERRUPT; 4878 4879 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4880 if (intel_engine_has_semaphores(request->engine)) 4881 cs = emit_preempt_busywait(request, cs); 4882 4883 request->tail = intel_ring_offset(request, cs); 4884 assert_ring_tail_valid(request->ring, request->tail); 4885 4886 return gen8_emit_wa_tail(request, cs); 4887 } 4888 4889 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) 4890 { 4891 u32 addr = i915_request_active_timeline(request)->hwsp_offset; 4892 4893 return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); 4894 } 4895 4896 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4897 { 4898 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4899 } 4900 4901 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4902 { 4903 cs = gen8_emit_pipe_control(cs, 4904 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4905 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4906 PIPE_CONTROL_DC_FLUSH_ENABLE, 4907 0); 4908 4909 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4910 cs = gen8_emit_ggtt_write_rcs(cs, 4911 request->fence.seqno, 4912 i915_request_active_timeline(request)->hwsp_offset, 4913 PIPE_CONTROL_FLUSH_ENABLE | 4914 PIPE_CONTROL_CS_STALL); 4915 4916 return gen8_emit_fini_breadcrumb_tail(request, cs); 4917 
} 4918 4919 static u32 * 4920 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4921 { 4922 cs = gen8_emit_ggtt_write_rcs(cs, 4923 request->fence.seqno, 4924 i915_request_active_timeline(request)->hwsp_offset, 4925 PIPE_CONTROL_CS_STALL | 4926 PIPE_CONTROL_TILE_CACHE_FLUSH | 4927 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4928 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4929 PIPE_CONTROL_DC_FLUSH_ENABLE | 4930 PIPE_CONTROL_FLUSH_ENABLE); 4931 4932 return gen8_emit_fini_breadcrumb_tail(request, cs); 4933 } 4934 4935 /* 4936 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4937 * flush and will continue pre-fetching the instructions after it before the 4938 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4939 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4940 * of the next request before the memory has been flushed, we're guaranteed that 4941 * we won't access the batch itself too early. 4942 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4943 * so, if the current request is modifying an instruction in the next request on 4944 * the same intel_context, we might pre-fetch and then execute the pre-update 4945 * instruction. To avoid this, the users of self-modifying code should either 4946 * disable the parser around the code emitting the memory writes, via a new flag 4947 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4948 * the in-kernel use-cases we've opted to use a separate context, see 4949 * reloc_gpu() as an example. 4950 * All the above applies only to the instructions themselves. Non-inline data 4951 * used by the instructions is not pre-fetched. 4952 */ 4953 4954 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4955 { 4956 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4957 MI_SEMAPHORE_GLOBAL_GTT | 4958 MI_SEMAPHORE_POLL | 4959 MI_SEMAPHORE_SAD_EQ_SDD; 4960 *cs++ = 0; 4961 *cs++ = intel_hws_preempt_address(request->engine); 4962 *cs++ = 0; 4963 *cs++ = 0; 4964 *cs++ = MI_NOOP; 4965 4966 return cs; 4967 } 4968 4969 static __always_inline u32* 4970 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4971 { 4972 *cs++ = MI_USER_INTERRUPT; 4973 4974 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4975 if (intel_engine_has_semaphores(request->engine)) 4976 cs = gen12_emit_preempt_busywait(request, cs); 4977 4978 request->tail = intel_ring_offset(request, cs); 4979 assert_ring_tail_valid(request->ring, request->tail); 4980 4981 return gen8_emit_wa_tail(request, cs); 4982 } 4983 4984 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4985 { 4986 return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4987 } 4988 4989 static u32 * 4990 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4991 { 4992 cs = gen12_emit_ggtt_write_rcs(cs, 4993 request->fence.seqno, 4994 i915_request_active_timeline(request)->hwsp_offset, 4995 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4996 PIPE_CONTROL_CS_STALL | 4997 PIPE_CONTROL_TILE_CACHE_FLUSH | 4998 PIPE_CONTROL_FLUSH_L3 | 4999 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5000 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5001 /* Wa_1409600907:tgl */ 5002 PIPE_CONTROL_DEPTH_STALL | 5003 PIPE_CONTROL_DC_FLUSH_ENABLE | 5004 PIPE_CONTROL_FLUSH_ENABLE); 5005 5006 return gen12_emit_fini_breadcrumb_tail(request, cs); 5007 } 5008 5009 static void execlists_park(struct intel_engine_cs *engine) 5010 { 5011 cancel_timer(&engine->execlists.timer); 5012 
cancel_timer(&engine->execlists.preempt);
5013 }
5014
5015 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5016 {
5017 engine->submit_request = execlists_submit_request;
5018 engine->schedule = i915_schedule;
5019 engine->execlists.tasklet.func = execlists_submission_tasklet;
5020
5021 engine->reset.prepare = execlists_reset_prepare;
5022 engine->reset.rewind = execlists_reset_rewind;
5023 engine->reset.cancel = execlists_reset_cancel;
5024 engine->reset.finish = execlists_reset_finish;
5025
5026 engine->park = execlists_park;
5027 engine->unpark = NULL;
5028
5029 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5030 if (!intel_vgpu_active(engine->i915)) {
5031 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5032 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5033 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5034 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5035 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5036 }
5037 }
5038
5039 if (INTEL_GEN(engine->i915) >= 12)
5040 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5041
5042 if (intel_engine_has_preemption(engine))
5043 engine->emit_bb_start = gen8_emit_bb_start;
5044 else
5045 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5046 }
5047
5048 static void execlists_shutdown(struct intel_engine_cs *engine)
5049 {
5050 /* Synchronise with residual timers and any softirq they raise */
5051 del_timer_sync(&engine->execlists.timer);
5052 del_timer_sync(&engine->execlists.preempt);
5053 tasklet_kill(&engine->execlists.tasklet);
5054 }
5055
5056 static void execlists_release(struct intel_engine_cs *engine)
5057 {
5058 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5059
5060 execlists_shutdown(engine);
5061
5062 intel_engine_cleanup_common(engine);
5063 lrc_destroy_wa_ctx(engine);
5064 }
5065
5066 static void
5067 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5068 {
5069 /* Default vfuncs which can be overridden by each engine. */
5070
5071 engine->resume = execlists_resume;
5072
5073 engine->cops = &execlists_context_ops;
5074 engine->request_alloc = execlists_request_alloc;
5075
5076 engine->emit_flush = gen8_emit_flush;
5077 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5078 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5079 if (INTEL_GEN(engine->i915) >= 12) {
5080 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5081 engine->emit_flush = gen12_emit_flush;
5082 }
5083 engine->set_default_submission = intel_execlists_set_default_submission;
5084
5085 if (INTEL_GEN(engine->i915) < 11) {
5086 engine->irq_enable = gen8_logical_ring_enable_irq;
5087 engine->irq_disable = gen8_logical_ring_disable_irq;
5088 } else {
5089 /*
5090 * TODO: On Gen11 interrupt masks need to be clear
5091 * to allow C6 entry. Keep interrupts enabled
5092 * and take the hit of generating extra interrupts
5093 * until a more refined solution exists.
5094 */ 5095 } 5096 } 5097 5098 static inline void 5099 logical_ring_default_irqs(struct intel_engine_cs *engine) 5100 { 5101 unsigned int shift = 0; 5102 5103 if (INTEL_GEN(engine->i915) < 11) { 5104 const u8 irq_shifts[] = { 5105 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5106 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5107 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5108 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5109 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5110 }; 5111 5112 shift = irq_shifts[engine->id]; 5113 } 5114 5115 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5116 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5117 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5118 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5119 } 5120 5121 static void rcs_submission_override(struct intel_engine_cs *engine) 5122 { 5123 switch (INTEL_GEN(engine->i915)) { 5124 case 12: 5125 engine->emit_flush = gen12_emit_flush_render; 5126 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5127 break; 5128 case 11: 5129 engine->emit_flush = gen11_emit_flush_render; 5130 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5131 break; 5132 default: 5133 engine->emit_flush = gen8_emit_flush_render; 5134 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5135 break; 5136 } 5137 } 5138 5139 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5140 { 5141 struct intel_engine_execlists * const execlists = &engine->execlists; 5142 struct drm_i915_private *i915 = engine->i915; 5143 struct intel_uncore *uncore = engine->uncore; 5144 u32 base = engine->mmio_base; 5145 5146 tasklet_init(&engine->execlists.tasklet, 5147 execlists_submission_tasklet, (unsigned long)engine); 5148 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5149 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5150 5151 logical_ring_default_vfuncs(engine); 5152 logical_ring_default_irqs(engine); 5153 5154 if (engine->class == RENDER_CLASS) 5155 rcs_submission_override(engine); 5156 5157 if (intel_init_workaround_bb(engine)) 5158 /* 5159 * We continue even if we fail to initialize WA batch 5160 * because we only expect rare glitches but nothing 5161 * critical to prevent us from using GPU 5162 */ 5163 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5164 5165 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5166 execlists->submit_reg = uncore->regs + 5167 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5168 execlists->ctrl_reg = uncore->regs + 5169 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5170 } else { 5171 execlists->submit_reg = uncore->regs + 5172 i915_mmio_reg_offset(RING_ELSP(base)); 5173 } 5174 5175 execlists->csb_status = 5176 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5177 5178 execlists->csb_write = 5179 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5180 5181 if (INTEL_GEN(i915) < 11) 5182 execlists->csb_size = GEN8_CSB_ENTRIES; 5183 else 5184 execlists->csb_size = GEN11_CSB_ENTRIES; 5185 5186 if (INTEL_GEN(engine->i915) >= 11) { 5187 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5188 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5189 } 5190 5191 /* Finally, take ownership and responsibility for cleanup! 
*/ 5192 engine->sanitize = execlists_sanitize; 5193 engine->release = execlists_release; 5194 5195 return 0; 5196 } 5197 5198 static void init_common_reg_state(u32 * const regs, 5199 const struct intel_engine_cs *engine, 5200 const struct intel_ring *ring, 5201 bool inhibit) 5202 { 5203 u32 ctl; 5204 5205 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5206 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5207 if (inhibit) 5208 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5209 if (INTEL_GEN(engine->i915) < 11) 5210 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5211 CTX_CTRL_RS_CTX_ENABLE); 5212 regs[CTX_CONTEXT_CONTROL] = ctl; 5213 5214 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5215 regs[CTX_TIMESTAMP] = 0; 5216 } 5217 5218 static void init_wa_bb_reg_state(u32 * const regs, 5219 const struct intel_engine_cs *engine) 5220 { 5221 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5222 5223 if (wa_ctx->per_ctx.size) { 5224 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5225 5226 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5227 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5228 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5229 } 5230 5231 if (wa_ctx->indirect_ctx.size) { 5232 lrc_ring_setup_indirect_ctx(regs, engine, 5233 i915_ggtt_offset(wa_ctx->vma) + 5234 wa_ctx->indirect_ctx.offset, 5235 wa_ctx->indirect_ctx.size); 5236 } 5237 } 5238 5239 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5240 { 5241 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5242 /* 64b PPGTT (48bit canonical) 5243 * PDP0_DESCRIPTOR contains the base address to PML4 and 5244 * other PDP Descriptors are ignored. 5245 */ 5246 ASSIGN_CTX_PML4(ppgtt, regs); 5247 } else { 5248 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5249 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5250 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5251 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5252 } 5253 } 5254 5255 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5256 { 5257 if (i915_is_ggtt(vm)) 5258 return i915_vm_to_ggtt(vm)->alias; 5259 else 5260 return i915_vm_to_ppgtt(vm); 5261 } 5262 5263 static void execlists_init_reg_state(u32 *regs, 5264 const struct intel_context *ce, 5265 const struct intel_engine_cs *engine, 5266 const struct intel_ring *ring, 5267 bool inhibit) 5268 { 5269 /* 5270 * A context is actually a big batch buffer with several 5271 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5272 * values we are setting here are only for the first context restore: 5273 * on a subsequent save, the GPU will recreate this batchbuffer with new 5274 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5275 * we are not initializing here). 5276 * 5277 * Must keep consistent with virtual_update_register_offsets(). 
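 *
 * Purely as an illustration of that layout (the authoritative offsets
 * come from reg_offsets()/set_offsets() below, not from this sketch):
 *
 *   MI_LOAD_REGISTER_IMM(n) | MI_LRI_FORCE_POSTED
 *     (context control, value) (ring head, 0) (ring tail, 0)
 *     (ring buffer start, addr) (ring buffer control, size | valid) ...
 *   MI_LOAD_REGISTER_IMM(m)
 *     (PDP/PML4 registers, page-directory addresses) ...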
5278 */ 5279 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5280 5281 init_common_reg_state(regs, engine, ring, inhibit); 5282 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5283 5284 init_wa_bb_reg_state(regs, engine); 5285 5286 __reset_stop_ring(regs, engine); 5287 } 5288 5289 static int 5290 populate_lr_context(struct intel_context *ce, 5291 struct drm_i915_gem_object *ctx_obj, 5292 struct intel_engine_cs *engine, 5293 struct intel_ring *ring) 5294 { 5295 bool inhibit = true; 5296 void *vaddr; 5297 5298 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5299 if (IS_ERR(vaddr)) { 5300 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5301 return PTR_ERR(vaddr); 5302 } 5303 5304 set_redzone(vaddr, engine); 5305 5306 if (engine->default_state) { 5307 shmem_read(engine->default_state, 0, 5308 vaddr, engine->context_size); 5309 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5310 inhibit = false; 5311 } 5312 5313 /* Clear the ppHWSP (inc. per-context counters) */ 5314 memset(vaddr, 0, PAGE_SIZE); 5315 5316 /* 5317 * The second page of the context object contains some registers which 5318 * must be set up prior to the first execution. 5319 */ 5320 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5321 ce, engine, ring, inhibit); 5322 5323 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5324 i915_gem_object_unpin_map(ctx_obj); 5325 return 0; 5326 } 5327 5328 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5329 { 5330 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5331 5332 return intel_timeline_create_from_engine(ce->engine, 5333 page_unmask_bits(tl)); 5334 } 5335 5336 static int __execlists_context_alloc(struct intel_context *ce, 5337 struct intel_engine_cs *engine) 5338 { 5339 struct drm_i915_gem_object *ctx_obj; 5340 struct intel_ring *ring; 5341 struct i915_vma *vma; 5342 u32 context_size; 5343 int ret; 5344 5345 GEM_BUG_ON(ce->state); 5346 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5347 5348 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5349 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5350 5351 if (INTEL_GEN(engine->i915) == 12) { 5352 ce->wa_bb_page = context_size / PAGE_SIZE; 5353 context_size += PAGE_SIZE; 5354 } 5355 5356 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5357 if (IS_ERR(ctx_obj)) 5358 return PTR_ERR(ctx_obj); 5359 5360 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5361 if (IS_ERR(vma)) { 5362 ret = PTR_ERR(vma); 5363 goto error_deref_obj; 5364 } 5365 5366 if (!page_mask_bits(ce->timeline)) { 5367 struct intel_timeline *tl; 5368 5369 /* 5370 * Use the static global HWSP for the kernel context, and 5371 * a dynamically allocated cacheline for everyone else. 
5372 */ 5373 if (unlikely(ce->timeline)) 5374 tl = pinned_timeline(ce); 5375 else 5376 tl = intel_timeline_create(engine->gt); 5377 if (IS_ERR(tl)) { 5378 ret = PTR_ERR(tl); 5379 goto error_deref_obj; 5380 } 5381 5382 ce->timeline = tl; 5383 } 5384 5385 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5386 if (IS_ERR(ring)) { 5387 ret = PTR_ERR(ring); 5388 goto error_deref_obj; 5389 } 5390 5391 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5392 if (ret) { 5393 drm_dbg(&engine->i915->drm, 5394 "Failed to populate LRC: %d\n", ret); 5395 goto error_ring_free; 5396 } 5397 5398 ce->ring = ring; 5399 ce->state = vma; 5400 5401 return 0; 5402 5403 error_ring_free: 5404 intel_ring_put(ring); 5405 error_deref_obj: 5406 i915_gem_object_put(ctx_obj); 5407 return ret; 5408 } 5409 5410 static struct list_head *virtual_queue(struct virtual_engine *ve) 5411 { 5412 return &ve->base.execlists.default_priolist.requests[0]; 5413 } 5414 5415 static void virtual_context_destroy(struct kref *kref) 5416 { 5417 struct virtual_engine *ve = 5418 container_of(kref, typeof(*ve), context.ref); 5419 unsigned int n; 5420 5421 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5422 GEM_BUG_ON(ve->request); 5423 GEM_BUG_ON(ve->context.inflight); 5424 5425 for (n = 0; n < ve->num_siblings; n++) { 5426 struct intel_engine_cs *sibling = ve->siblings[n]; 5427 struct rb_node *node = &ve->nodes[sibling->id].rb; 5428 unsigned long flags; 5429 5430 if (RB_EMPTY_NODE(node)) 5431 continue; 5432 5433 spin_lock_irqsave(&sibling->active.lock, flags); 5434 5435 /* Detachment is lazily performed in the execlists tasklet */ 5436 if (!RB_EMPTY_NODE(node)) 5437 rb_erase_cached(node, &sibling->execlists.virtual); 5438 5439 spin_unlock_irqrestore(&sibling->active.lock, flags); 5440 } 5441 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5442 5443 if (ve->context.state) 5444 __execlists_context_fini(&ve->context); 5445 intel_context_fini(&ve->context); 5446 5447 intel_engine_free_request_pool(&ve->base); 5448 5449 kfree(ve->bonds); 5450 kfree(ve); 5451 } 5452 5453 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5454 { 5455 int swp; 5456 5457 /* 5458 * Pick a random sibling on starting to help spread the load around. 5459 * 5460 * New contexts are typically created with exactly the same order 5461 * of siblings, and often started in batches. Due to the way we iterate 5462 * the array of sibling when submitting requests, sibling[0] is 5463 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5464 * randomised across the system, we also help spread the load by the 5465 * first engine we inspect being different each time. 5466 * 5467 * NB This does not force us to execute on this engine, it will just 5468 * typically be the first we inspect for submission. 
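 *
 * For example, with three siblings {A, B, C} and a random swp of 2,
 * the array becomes {C, B, A} and C is the first engine considered
 * for this virtual engine's submissions.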
5469 */ 5470 swp = prandom_u32_max(ve->num_siblings); 5471 if (swp) 5472 swap(ve->siblings[swp], ve->siblings[0]); 5473 } 5474 5475 static int virtual_context_alloc(struct intel_context *ce) 5476 { 5477 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5478 5479 return __execlists_context_alloc(ce, ve->siblings[0]); 5480 } 5481 5482 static int virtual_context_pin(struct intel_context *ce, void *vaddr) 5483 { 5484 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5485 5486 /* Note: we must use a real engine class for setting up reg state */ 5487 return __execlists_context_pin(ce, ve->siblings[0], vaddr); 5488 } 5489 5490 static void virtual_context_enter(struct intel_context *ce) 5491 { 5492 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5493 unsigned int n; 5494 5495 for (n = 0; n < ve->num_siblings; n++) 5496 intel_engine_pm_get(ve->siblings[n]); 5497 5498 intel_timeline_enter(ce->timeline); 5499 } 5500 5501 static void virtual_context_exit(struct intel_context *ce) 5502 { 5503 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5504 unsigned int n; 5505 5506 intel_timeline_exit(ce->timeline); 5507 5508 for (n = 0; n < ve->num_siblings; n++) 5509 intel_engine_pm_put(ve->siblings[n]); 5510 } 5511 5512 static const struct intel_context_ops virtual_context_ops = { 5513 .alloc = virtual_context_alloc, 5514 5515 .pre_pin = execlists_context_pre_pin, 5516 .pin = virtual_context_pin, 5517 .unpin = execlists_context_unpin, 5518 .post_unpin = execlists_context_post_unpin, 5519 5520 .enter = virtual_context_enter, 5521 .exit = virtual_context_exit, 5522 5523 .destroy = virtual_context_destroy, 5524 }; 5525 5526 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5527 { 5528 struct i915_request *rq; 5529 intel_engine_mask_t mask; 5530 5531 rq = READ_ONCE(ve->request); 5532 if (!rq) 5533 return 0; 5534 5535 /* The rq is ready for submission; rq->execution_mask is now stable. 
*/ 5536 mask = rq->execution_mask; 5537 if (unlikely(!mask)) { 5538 /* Invalid selection, submit to a random engine in error */ 5539 i915_request_set_error_once(rq, -ENODEV); 5540 mask = ve->siblings[0]->mask; 5541 } 5542 5543 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5544 rq->fence.context, rq->fence.seqno, 5545 mask, ve->base.execlists.queue_priority_hint); 5546 5547 return mask; 5548 } 5549 5550 static void virtual_submission_tasklet(unsigned long data) 5551 { 5552 struct virtual_engine * const ve = (struct virtual_engine *)data; 5553 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5554 intel_engine_mask_t mask; 5555 unsigned int n; 5556 5557 rcu_read_lock(); 5558 mask = virtual_submission_mask(ve); 5559 rcu_read_unlock(); 5560 if (unlikely(!mask)) 5561 return; 5562 5563 local_irq_disable(); 5564 for (n = 0; n < ve->num_siblings; n++) { 5565 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5566 struct ve_node * const node = &ve->nodes[sibling->id]; 5567 struct rb_node **parent, *rb; 5568 bool first; 5569 5570 if (!READ_ONCE(ve->request)) 5571 break; /* already handled by a sibling's tasklet */ 5572 5573 if (unlikely(!(mask & sibling->mask))) { 5574 if (!RB_EMPTY_NODE(&node->rb)) { 5575 spin_lock(&sibling->active.lock); 5576 rb_erase_cached(&node->rb, 5577 &sibling->execlists.virtual); 5578 RB_CLEAR_NODE(&node->rb); 5579 spin_unlock(&sibling->active.lock); 5580 } 5581 continue; 5582 } 5583 5584 spin_lock(&sibling->active.lock); 5585 5586 if (!RB_EMPTY_NODE(&node->rb)) { 5587 /* 5588 * Cheat and avoid rebalancing the tree if we can 5589 * reuse this node in situ. 5590 */ 5591 first = rb_first_cached(&sibling->execlists.virtual) == 5592 &node->rb; 5593 if (prio == node->prio || (prio > node->prio && first)) 5594 goto submit_engine; 5595 5596 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5597 } 5598 5599 rb = NULL; 5600 first = true; 5601 parent = &sibling->execlists.virtual.rb_root.rb_node; 5602 while (*parent) { 5603 struct ve_node *other; 5604 5605 rb = *parent; 5606 other = rb_entry(rb, typeof(*other), rb); 5607 if (prio > other->prio) { 5608 parent = &rb->rb_left; 5609 } else { 5610 parent = &rb->rb_right; 5611 first = false; 5612 } 5613 } 5614 5615 rb_link_node(&node->rb, rb, parent); 5616 rb_insert_color_cached(&node->rb, 5617 &sibling->execlists.virtual, 5618 first); 5619 5620 submit_engine: 5621 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5622 node->prio = prio; 5623 if (first && prio > sibling->execlists.queue_priority_hint) 5624 tasklet_hi_schedule(&sibling->execlists.tasklet); 5625 5626 spin_unlock(&sibling->active.lock); 5627 } 5628 local_irq_enable(); 5629 } 5630 5631 static void virtual_submit_request(struct i915_request *rq) 5632 { 5633 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5634 struct i915_request *old; 5635 unsigned long flags; 5636 5637 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5638 rq->fence.context, 5639 rq->fence.seqno); 5640 5641 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5642 5643 spin_lock_irqsave(&ve->base.active.lock, flags); 5644 5645 old = ve->request; 5646 if (old) { /* background completion event from preempt-to-busy */ 5647 GEM_BUG_ON(!i915_request_completed(old)); 5648 __i915_request_submit(old); 5649 i915_request_put(old); 5650 } 5651 5652 if (i915_request_completed(rq)) { 5653 __i915_request_submit(rq); 5654 5655 ve->base.execlists.queue_priority_hint = INT_MIN; 5656 ve->request = NULL; 5657 } else { 5658 ve->base.execlists.queue_priority_hint = rq_prio(rq); 
static void virtual_submit_request(struct i915_request *rq)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	struct i915_request *old;
	unsigned long flags;

	ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
		     rq->fence.context,
		     rq->fence.seqno);

	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);

	spin_lock_irqsave(&ve->base.active.lock, flags);

	old = ve->request;
	if (old) { /* background completion event from preempt-to-busy */
		GEM_BUG_ON(!i915_request_completed(old));
		__i915_request_submit(old);
		i915_request_put(old);
	}

	if (i915_request_completed(rq)) {
		__i915_request_submit(rq);

		ve->base.execlists.queue_priority_hint = INT_MIN;
		ve->request = NULL;
	} else {
		ve->base.execlists.queue_priority_hint = rq_prio(rq);
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}
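
/*
 * Illustrative usage of the constructor below (a sketch only; the engine
 * pointers "vcs0" and "vcs1" are hypothetical stand-ins for whatever
 * physical engines the caller has already looked up):
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 *
 * A count of 1 simply degenerates to a plain context on that one engine,
 * and a count of 0 is rejected with -EINVAL.
 */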
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be applied globally to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handled cloning of the requests and
		 * submitted a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible in their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}
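
/*
 * Cloning a virtual engine builds a fresh virtual context over the same
 * sibling array and then duplicates any bonds, so the copy load-balances
 * (and bonds) across the same set of physical engines as the original.
 */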
struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}
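
/*
 * Debug pretty-printer: dumps the requests currently executing on the
 * engine, those waiting in the priority queue, and any requests parked on
 * virtual engines bound to this physical engine ("E", "Q" and "V"
 * prefixes respectively). For each group at most @max entries are shown
 * -- the first max - 1 and the last -- with a note of how many were
 * skipped in between.
 */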
void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be submitted only after userspace has had the
	 * opportunity to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif
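
/*
 * The conditional #include above compiles the execlists/LRC selftests into
 * this translation unit when CONFIG_DRM_I915_SELFTEST is enabled, giving
 * them access to the static helpers defined in this file.
 */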