/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things into the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc..)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated.
 * The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different
 * context, which will cause the hardware to continue executing the second request
 * and queue the new request (the GPU detects the condition of a context getting
 * preempted with the same context and optimizes the context switch flow by not
 * doing preemption, but just sampling the new tail pointer).
 *
 */
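/*
 * Purely illustrative sketch of the pairing rule described above; the
 * helpers "pop", "peek" and "same_context" are hypothetical and the real
 * logic lives in execlists_dequeue() below:
 *
 *	elsp[0] = pop(queue);
 *	while ((next = peek(queue)) && same_context(next, elsp[0]))
 *		elsp[0] = pop(queue);	-- coalesce back-to-back requests
 *	elsp[1] = next;			-- NULL if the queue has drained
 *
 * i.e. consecutive requests from one context collapse into a single
 * element, and the second port is only ever filled by a different context.
 */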
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep an rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of the sibling_mask physical engines.
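	 *
	 * As a hedged sketch of how this table is meant to be consumed
	 * (paraphrasing the bonding helpers later in this file):
	 *
	 *	for (i = 0; i < ve->num_bonds; i++)
	 *		if (ve->bonds[i].master == master)
	 *			return ve->bonds[i].sibling_mask;
	 *	return 0;	-- no bond recorded: no extra restriction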
212 */ 213 struct ve_bond { 214 const struct intel_engine_cs *master; 215 intel_engine_mask_t sibling_mask; 216 } *bonds; 217 unsigned int num_bonds; 218 219 /* And finally, which physical engines this virtual engine maps onto. */ 220 unsigned int num_siblings; 221 struct intel_engine_cs *siblings[]; 222 }; 223 224 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 225 { 226 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 227 return container_of(engine, struct virtual_engine, base); 228 } 229 230 static int __execlists_context_alloc(struct intel_context *ce, 231 struct intel_engine_cs *engine); 232 233 static void execlists_init_reg_state(u32 *reg_state, 234 const struct intel_context *ce, 235 const struct intel_engine_cs *engine, 236 const struct intel_ring *ring, 237 bool close); 238 static void 239 __execlists_update_reg_state(const struct intel_context *ce, 240 const struct intel_engine_cs *engine, 241 u32 head); 242 243 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 244 { 245 if (INTEL_GEN(engine->i915) >= 12) 246 return 0x60; 247 else if (INTEL_GEN(engine->i915) >= 9) 248 return 0x54; 249 else if (engine->class == RENDER_CLASS) 250 return 0x58; 251 else 252 return -1; 253 } 254 255 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 256 { 257 if (INTEL_GEN(engine->i915) >= 12) 258 return 0x74; 259 else if (INTEL_GEN(engine->i915) >= 9) 260 return 0x68; 261 else if (engine->class == RENDER_CLASS) 262 return 0xd8; 263 else 264 return -1; 265 } 266 267 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 268 { 269 if (INTEL_GEN(engine->i915) >= 12) 270 return 0x12; 271 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 272 return 0x18; 273 else 274 return -1; 275 } 276 277 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 278 { 279 int x; 280 281 x = lrc_ring_wa_bb_per_ctx(engine); 282 if (x < 0) 283 return x; 284 285 return x + 2; 286 } 287 288 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 289 { 290 int x; 291 292 x = lrc_ring_indirect_ptr(engine); 293 if (x < 0) 294 return x; 295 296 return x + 2; 297 } 298 299 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 300 { 301 if (engine->class != RENDER_CLASS) 302 return -1; 303 304 if (INTEL_GEN(engine->i915) >= 12) 305 return 0xb6; 306 else if (INTEL_GEN(engine->i915) >= 11) 307 return 0xaa; 308 else 309 return -1; 310 } 311 312 static u32 313 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 314 { 315 switch (INTEL_GEN(engine->i915)) { 316 default: 317 MISSING_CASE(INTEL_GEN(engine->i915)); 318 fallthrough; 319 case 12: 320 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 321 case 11: 322 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 323 case 10: 324 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 325 case 9: 326 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 327 case 8: 328 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 329 } 330 } 331 332 static void 333 lrc_ring_setup_indirect_ctx(u32 *regs, 334 const struct intel_engine_cs *engine, 335 u32 ctx_bb_ggtt_addr, 336 u32 size) 337 { 338 GEM_BUG_ON(!size); 339 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 340 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 341 regs[lrc_ring_indirect_ptr(engine) + 1] = 342 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 343 344 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 345 regs[lrc_ring_indirect_offset(engine) + 1] = 346 
lrc_ring_indirect_offset_default(engine) << 6; 347 } 348 349 static u32 intel_context_get_runtime(const struct intel_context *ce) 350 { 351 /* 352 * We can use either ppHWSP[16] which is recorded before the context 353 * switch (and so excludes the cost of context switches) or use the 354 * value from the context image itself, which is saved/restored earlier 355 * and so includes the cost of the save. 356 */ 357 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 358 } 359 360 static void mark_eio(struct i915_request *rq) 361 { 362 if (i915_request_completed(rq)) 363 return; 364 365 GEM_BUG_ON(i915_request_signaled(rq)); 366 367 i915_request_set_error_once(rq, -EIO); 368 i915_request_mark_complete(rq); 369 } 370 371 static struct i915_request * 372 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 373 { 374 struct i915_request *active = rq; 375 376 rcu_read_lock(); 377 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 378 if (i915_request_completed(rq)) 379 break; 380 381 active = rq; 382 } 383 rcu_read_unlock(); 384 385 return active; 386 } 387 388 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 389 { 390 return (i915_ggtt_offset(engine->status_page.vma) + 391 I915_GEM_HWS_PREEMPT_ADDR); 392 } 393 394 static inline void 395 ring_set_paused(const struct intel_engine_cs *engine, int state) 396 { 397 /* 398 * We inspect HWS_PREEMPT with a semaphore inside 399 * engine->emit_fini_breadcrumb. If the dword is true, 400 * the ring is paused as the semaphore will busywait 401 * until the dword is false. 402 */ 403 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 404 if (state) 405 wmb(); 406 } 407 408 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 409 { 410 return rb_entry(rb, struct i915_priolist, node); 411 } 412 413 static inline int rq_prio(const struct i915_request *rq) 414 { 415 return READ_ONCE(rq->sched.attr.priority); 416 } 417 418 static int effective_prio(const struct i915_request *rq) 419 { 420 int prio = rq_prio(rq); 421 422 /* 423 * If this request is special and must not be interrupted at any 424 * cost, so be it. Note we are only checking the most recent request 425 * in the context and so may be masking an earlier vip request. It 426 * is hoped that under the conditions where nopreempt is used, this 427 * will not matter (i.e. all requests to that context will be 428 * nopreempt for as long as desired). 429 */ 430 if (i915_request_has_nopreempt(rq)) 431 prio = I915_PRIORITY_UNPREEMPTABLE; 432 433 return prio; 434 } 435 436 static int queue_prio(const struct intel_engine_execlists *execlists) 437 { 438 struct i915_priolist *p; 439 struct rb_node *rb; 440 441 rb = rb_first_cached(&execlists->queue); 442 if (!rb) 443 return INT_MIN; 444 445 /* 446 * As the priolist[] are inverted, with the highest priority in [0], 447 * we have to flip the index value to become priority. 448 */ 449 p = to_priolist(rb); 450 if (!I915_USER_PRIORITY_SHIFT) 451 return p->priority; 452 453 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 454 } 455 456 static inline bool need_preempt(const struct intel_engine_cs *engine, 457 const struct i915_request *rq, 458 struct rb_node *rb) 459 { 460 int last_prio; 461 462 if (!intel_engine_has_semaphores(engine)) 463 return false; 464 465 /* 466 * Check if the current priority hint merits a preemption attempt. 
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *	prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	if (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		bool preempt = false;

		if (engine == ve->siblings[0]) { /* only preempt one sibling */
			struct i915_request *next;

			rcu_read_lock();
			next = READ_ONCE(ve->request);
			if (next)
				preempt = rq_prio(next) > last_prio;
			rcu_read_unlock();
		}

		if (preempt)
			return preempt;
	}

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue can not be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return queue_prio(&engine->execlists) > last_prio;
}

__maybe_unused static inline bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

/*
 * The context descriptor encodes various attributes of a context,
 * including its GTT address and some flags. Because it's fairly
 * expensive to calculate, we'll just do it once and cache the result,
 * which remains valid until the context is unpinned.
549 * 550 * This is what a descriptor looks like, from LSB to MSB:: 551 * 552 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 553 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 554 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 555 * bits 53-54: mbz, reserved for use by hardware 556 * bits 55-63: group ID, currently unused and set to 0 557 * 558 * Starting from Gen11, the upper dword of the descriptor has a new format: 559 * 560 * bits 32-36: reserved 561 * bits 37-47: SW context ID 562 * bits 48:53: engine instance 563 * bit 54: mbz, reserved for use by hardware 564 * bits 55-60: SW counter 565 * bits 61-63: engine class 566 * 567 * engine info, SW context ID and SW counter need to form a unique number 568 * (Context ID) per lrc. 569 */ 570 static u32 571 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 572 { 573 u32 desc; 574 575 desc = INTEL_LEGACY_32B_CONTEXT; 576 if (i915_vm_is_4lvl(ce->vm)) 577 desc = INTEL_LEGACY_64B_CONTEXT; 578 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 579 580 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 581 if (IS_GEN(engine->i915, 8)) 582 desc |= GEN8_CTX_L3LLC_COHERENT; 583 584 return i915_ggtt_offset(ce->state) | desc; 585 } 586 587 static inline unsigned int dword_in_page(void *addr) 588 { 589 return offset_in_page(addr) / sizeof(u32); 590 } 591 592 static void set_offsets(u32 *regs, 593 const u8 *data, 594 const struct intel_engine_cs *engine, 595 bool clear) 596 #define NOP(x) (BIT(7) | (x)) 597 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 598 #define POSTED BIT(0) 599 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 600 #define REG16(x) \ 601 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 602 (((x) >> 2) & 0x7f) 603 #define END(total_state_size) 0, (total_state_size) 604 { 605 const u32 base = engine->mmio_base; 606 607 while (*data) { 608 u8 count, flags; 609 610 if (*data & BIT(7)) { /* skip */ 611 count = *data++ & ~BIT(7); 612 if (clear) 613 memset32(regs, MI_NOOP, count); 614 regs += count; 615 continue; 616 } 617 618 count = *data & 0x3f; 619 flags = *data >> 6; 620 data++; 621 622 *regs = MI_LOAD_REGISTER_IMM(count); 623 if (flags & POSTED) 624 *regs |= MI_LRI_FORCE_POSTED; 625 if (INTEL_GEN(engine->i915) >= 11) 626 *regs |= MI_LRI_LRM_CS_MMIO; 627 regs++; 628 629 GEM_BUG_ON(!count); 630 do { 631 u32 offset = 0; 632 u8 v; 633 634 do { 635 v = *data++; 636 offset <<= 7; 637 offset |= v & ~BIT(7); 638 } while (v & BIT(7)); 639 640 regs[0] = base + (offset << 2); 641 if (clear) 642 regs[1] = 0; 643 regs += 2; 644 } while (--count); 645 } 646 647 if (clear) { 648 u8 count = *++data; 649 650 /* Clear past the tail for HW access */ 651 GEM_BUG_ON(dword_in_page(regs) > count); 652 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 653 654 /* Close the batch; used mainly by live_lrc_layout() */ 655 *regs = MI_BATCH_BUFFER_END; 656 if (INTEL_GEN(engine->i915) >= 10) 657 *regs |= BIT(0); 658 } 659 } 660 661 static const u8 gen8_xcs_offsets[] = { 662 NOP(1), 663 LRI(11, 0), 664 REG16(0x244), 665 REG(0x034), 666 REG(0x030), 667 REG(0x038), 668 REG(0x03c), 669 REG(0x168), 670 REG(0x140), 671 REG(0x110), 672 REG(0x11c), 673 REG(0x114), 674 REG(0x118), 675 676 NOP(9), 677 LRI(9, 0), 678 REG16(0x3a8), 679 REG16(0x28c), 680 REG16(0x288), 681 REG16(0x284), 682 REG16(0x280), 683 REG16(0x27c), 684 REG16(0x278), 685 REG16(0x274), 686 REG16(0x270), 687 688 NOP(13), 689 LRI(2, 0), 690 REG16(0x200), 691 REG(0x028), 692 693 END(80) 
694 }; 695 696 static const u8 gen9_xcs_offsets[] = { 697 NOP(1), 698 LRI(14, POSTED), 699 REG16(0x244), 700 REG(0x034), 701 REG(0x030), 702 REG(0x038), 703 REG(0x03c), 704 REG(0x168), 705 REG(0x140), 706 REG(0x110), 707 REG(0x11c), 708 REG(0x114), 709 REG(0x118), 710 REG(0x1c0), 711 REG(0x1c4), 712 REG(0x1c8), 713 714 NOP(3), 715 LRI(9, POSTED), 716 REG16(0x3a8), 717 REG16(0x28c), 718 REG16(0x288), 719 REG16(0x284), 720 REG16(0x280), 721 REG16(0x27c), 722 REG16(0x278), 723 REG16(0x274), 724 REG16(0x270), 725 726 NOP(13), 727 LRI(1, POSTED), 728 REG16(0x200), 729 730 NOP(13), 731 LRI(44, POSTED), 732 REG(0x028), 733 REG(0x09c), 734 REG(0x0c0), 735 REG(0x178), 736 REG(0x17c), 737 REG16(0x358), 738 REG(0x170), 739 REG(0x150), 740 REG(0x154), 741 REG(0x158), 742 REG16(0x41c), 743 REG16(0x600), 744 REG16(0x604), 745 REG16(0x608), 746 REG16(0x60c), 747 REG16(0x610), 748 REG16(0x614), 749 REG16(0x618), 750 REG16(0x61c), 751 REG16(0x620), 752 REG16(0x624), 753 REG16(0x628), 754 REG16(0x62c), 755 REG16(0x630), 756 REG16(0x634), 757 REG16(0x638), 758 REG16(0x63c), 759 REG16(0x640), 760 REG16(0x644), 761 REG16(0x648), 762 REG16(0x64c), 763 REG16(0x650), 764 REG16(0x654), 765 REG16(0x658), 766 REG16(0x65c), 767 REG16(0x660), 768 REG16(0x664), 769 REG16(0x668), 770 REG16(0x66c), 771 REG16(0x670), 772 REG16(0x674), 773 REG16(0x678), 774 REG16(0x67c), 775 REG(0x068), 776 777 END(176) 778 }; 779 780 static const u8 gen12_xcs_offsets[] = { 781 NOP(1), 782 LRI(13, POSTED), 783 REG16(0x244), 784 REG(0x034), 785 REG(0x030), 786 REG(0x038), 787 REG(0x03c), 788 REG(0x168), 789 REG(0x140), 790 REG(0x110), 791 REG(0x1c0), 792 REG(0x1c4), 793 REG(0x1c8), 794 REG(0x180), 795 REG16(0x2b4), 796 797 NOP(5), 798 LRI(9, POSTED), 799 REG16(0x3a8), 800 REG16(0x28c), 801 REG16(0x288), 802 REG16(0x284), 803 REG16(0x280), 804 REG16(0x27c), 805 REG16(0x278), 806 REG16(0x274), 807 REG16(0x270), 808 809 END(80) 810 }; 811 812 static const u8 gen8_rcs_offsets[] = { 813 NOP(1), 814 LRI(14, POSTED), 815 REG16(0x244), 816 REG(0x034), 817 REG(0x030), 818 REG(0x038), 819 REG(0x03c), 820 REG(0x168), 821 REG(0x140), 822 REG(0x110), 823 REG(0x11c), 824 REG(0x114), 825 REG(0x118), 826 REG(0x1c0), 827 REG(0x1c4), 828 REG(0x1c8), 829 830 NOP(3), 831 LRI(9, POSTED), 832 REG16(0x3a8), 833 REG16(0x28c), 834 REG16(0x288), 835 REG16(0x284), 836 REG16(0x280), 837 REG16(0x27c), 838 REG16(0x278), 839 REG16(0x274), 840 REG16(0x270), 841 842 NOP(13), 843 LRI(1, 0), 844 REG(0x0c8), 845 846 END(80) 847 }; 848 849 static const u8 gen9_rcs_offsets[] = { 850 NOP(1), 851 LRI(14, POSTED), 852 REG16(0x244), 853 REG(0x34), 854 REG(0x30), 855 REG(0x38), 856 REG(0x3c), 857 REG(0x168), 858 REG(0x140), 859 REG(0x110), 860 REG(0x11c), 861 REG(0x114), 862 REG(0x118), 863 REG(0x1c0), 864 REG(0x1c4), 865 REG(0x1c8), 866 867 NOP(3), 868 LRI(9, POSTED), 869 REG16(0x3a8), 870 REG16(0x28c), 871 REG16(0x288), 872 REG16(0x284), 873 REG16(0x280), 874 REG16(0x27c), 875 REG16(0x278), 876 REG16(0x274), 877 REG16(0x270), 878 879 NOP(13), 880 LRI(1, 0), 881 REG(0xc8), 882 883 NOP(13), 884 LRI(44, POSTED), 885 REG(0x28), 886 REG(0x9c), 887 REG(0xc0), 888 REG(0x178), 889 REG(0x17c), 890 REG16(0x358), 891 REG(0x170), 892 REG(0x150), 893 REG(0x154), 894 REG(0x158), 895 REG16(0x41c), 896 REG16(0x600), 897 REG16(0x604), 898 REG16(0x608), 899 REG16(0x60c), 900 REG16(0x610), 901 REG16(0x614), 902 REG16(0x618), 903 REG16(0x61c), 904 REG16(0x620), 905 REG16(0x624), 906 REG16(0x628), 907 REG16(0x62c), 908 REG16(0x630), 909 REG16(0x634), 910 REG16(0x638), 911 REG16(0x63c), 912 
REG16(0x640), 913 REG16(0x644), 914 REG16(0x648), 915 REG16(0x64c), 916 REG16(0x650), 917 REG16(0x654), 918 REG16(0x658), 919 REG16(0x65c), 920 REG16(0x660), 921 REG16(0x664), 922 REG16(0x668), 923 REG16(0x66c), 924 REG16(0x670), 925 REG16(0x674), 926 REG16(0x678), 927 REG16(0x67c), 928 REG(0x68), 929 930 END(176) 931 }; 932 933 static const u8 gen11_rcs_offsets[] = { 934 NOP(1), 935 LRI(15, POSTED), 936 REG16(0x244), 937 REG(0x034), 938 REG(0x030), 939 REG(0x038), 940 REG(0x03c), 941 REG(0x168), 942 REG(0x140), 943 REG(0x110), 944 REG(0x11c), 945 REG(0x114), 946 REG(0x118), 947 REG(0x1c0), 948 REG(0x1c4), 949 REG(0x1c8), 950 REG(0x180), 951 952 NOP(1), 953 LRI(9, POSTED), 954 REG16(0x3a8), 955 REG16(0x28c), 956 REG16(0x288), 957 REG16(0x284), 958 REG16(0x280), 959 REG16(0x27c), 960 REG16(0x278), 961 REG16(0x274), 962 REG16(0x270), 963 964 LRI(1, POSTED), 965 REG(0x1b0), 966 967 NOP(10), 968 LRI(1, 0), 969 REG(0x0c8), 970 971 END(80) 972 }; 973 974 static const u8 gen12_rcs_offsets[] = { 975 NOP(1), 976 LRI(13, POSTED), 977 REG16(0x244), 978 REG(0x034), 979 REG(0x030), 980 REG(0x038), 981 REG(0x03c), 982 REG(0x168), 983 REG(0x140), 984 REG(0x110), 985 REG(0x1c0), 986 REG(0x1c4), 987 REG(0x1c8), 988 REG(0x180), 989 REG16(0x2b4), 990 991 NOP(5), 992 LRI(9, POSTED), 993 REG16(0x3a8), 994 REG16(0x28c), 995 REG16(0x288), 996 REG16(0x284), 997 REG16(0x280), 998 REG16(0x27c), 999 REG16(0x278), 1000 REG16(0x274), 1001 REG16(0x270), 1002 1003 LRI(3, POSTED), 1004 REG(0x1b0), 1005 REG16(0x5a8), 1006 REG16(0x5ac), 1007 1008 NOP(6), 1009 LRI(1, 0), 1010 REG(0x0c8), 1011 NOP(3 + 9 + 1), 1012 1013 LRI(51, POSTED), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG16(0x588), 1020 REG(0x028), 1021 REG(0x09c), 1022 REG(0x0c0), 1023 REG(0x178), 1024 REG(0x17c), 1025 REG16(0x358), 1026 REG(0x170), 1027 REG(0x150), 1028 REG(0x154), 1029 REG(0x158), 1030 REG16(0x41c), 1031 REG16(0x600), 1032 REG16(0x604), 1033 REG16(0x608), 1034 REG16(0x60c), 1035 REG16(0x610), 1036 REG16(0x614), 1037 REG16(0x618), 1038 REG16(0x61c), 1039 REG16(0x620), 1040 REG16(0x624), 1041 REG16(0x628), 1042 REG16(0x62c), 1043 REG16(0x630), 1044 REG16(0x634), 1045 REG16(0x638), 1046 REG16(0x63c), 1047 REG16(0x640), 1048 REG16(0x644), 1049 REG16(0x648), 1050 REG16(0x64c), 1051 REG16(0x650), 1052 REG16(0x654), 1053 REG16(0x658), 1054 REG16(0x65c), 1055 REG16(0x660), 1056 REG16(0x664), 1057 REG16(0x668), 1058 REG16(0x66c), 1059 REG16(0x670), 1060 REG16(0x674), 1061 REG16(0x678), 1062 REG16(0x67c), 1063 REG(0x068), 1064 REG(0x084), 1065 NOP(1), 1066 1067 END(192) 1068 }; 1069 1070 #undef END 1071 #undef REG16 1072 #undef REG 1073 #undef LRI 1074 #undef NOP 1075 1076 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1077 { 1078 /* 1079 * The gen12+ lists only have the registers we program in the basic 1080 * default state. We rely on the context image using relative 1081 * addressing to automatic fixup the register state between the 1082 * physical engines for virtual engine. 
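	 *
	 * ("Relative addressing" refers to the MI_LRI_LRM_CS_MMIO bit that
	 * set_offsets() adds to the LRI headers on gen11+, which makes the
	 * register offsets in the context image relative to each engine's
	 * mmio base and so lets one image be reused across the siblings of
	 * a virtual engine.)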
1083 */ 1084 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1085 !intel_engine_has_relative_mmio(engine)); 1086 1087 if (engine->class == RENDER_CLASS) { 1088 if (INTEL_GEN(engine->i915) >= 12) 1089 return gen12_rcs_offsets; 1090 else if (INTEL_GEN(engine->i915) >= 11) 1091 return gen11_rcs_offsets; 1092 else if (INTEL_GEN(engine->i915) >= 9) 1093 return gen9_rcs_offsets; 1094 else 1095 return gen8_rcs_offsets; 1096 } else { 1097 if (INTEL_GEN(engine->i915) >= 12) 1098 return gen12_xcs_offsets; 1099 else if (INTEL_GEN(engine->i915) >= 9) 1100 return gen9_xcs_offsets; 1101 else 1102 return gen8_xcs_offsets; 1103 } 1104 } 1105 1106 static struct i915_request * 1107 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1108 { 1109 struct i915_request *rq, *rn, *active = NULL; 1110 struct list_head *pl; 1111 int prio = I915_PRIORITY_INVALID; 1112 1113 lockdep_assert_held(&engine->active.lock); 1114 1115 list_for_each_entry_safe_reverse(rq, rn, 1116 &engine->active.requests, 1117 sched.link) { 1118 if (i915_request_completed(rq)) 1119 continue; /* XXX */ 1120 1121 __i915_request_unsubmit(rq); 1122 1123 /* 1124 * Push the request back into the queue for later resubmission. 1125 * If this request is not native to this physical engine (i.e. 1126 * it came from a virtual source), push it back onto the virtual 1127 * engine so that it can be moved across onto another physical 1128 * engine as load dictates. 1129 */ 1130 if (likely(rq->execution_mask == engine->mask)) { 1131 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1132 if (rq_prio(rq) != prio) { 1133 prio = rq_prio(rq); 1134 pl = i915_sched_lookup_priolist(engine, prio); 1135 } 1136 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1137 1138 list_move(&rq->sched.link, pl); 1139 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1140 1141 /* Check in case we rollback so far we wrap [size/2] */ 1142 if (intel_ring_direction(rq->ring, 1143 rq->tail, 1144 rq->ring->tail + 8) > 0) 1145 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1146 1147 active = rq; 1148 } else { 1149 struct intel_engine_cs *owner = rq->context->engine; 1150 1151 WRITE_ONCE(rq->engine, owner); 1152 owner->submit_request(rq); 1153 active = NULL; 1154 } 1155 } 1156 1157 return active; 1158 } 1159 1160 struct i915_request * 1161 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1162 { 1163 struct intel_engine_cs *engine = 1164 container_of(execlists, typeof(*engine), execlists); 1165 1166 return __unwind_incomplete_requests(engine); 1167 } 1168 1169 static inline void 1170 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1171 { 1172 /* 1173 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1174 * The compiler should eliminate this function as dead-code. 
1175 */ 1176 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1177 return; 1178 1179 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1180 status, rq); 1181 } 1182 1183 static void intel_engine_context_in(struct intel_engine_cs *engine) 1184 { 1185 unsigned long flags; 1186 1187 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1188 return; 1189 1190 write_seqlock_irqsave(&engine->stats.lock, flags); 1191 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1192 engine->stats.start = ktime_get(); 1193 atomic_inc(&engine->stats.active); 1194 } 1195 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1196 } 1197 1198 static void intel_engine_context_out(struct intel_engine_cs *engine) 1199 { 1200 unsigned long flags; 1201 1202 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1203 1204 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1205 return; 1206 1207 write_seqlock_irqsave(&engine->stats.lock, flags); 1208 if (atomic_dec_and_test(&engine->stats.active)) { 1209 engine->stats.total = 1210 ktime_add(engine->stats.total, 1211 ktime_sub(ktime_get(), engine->stats.start)); 1212 } 1213 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1214 } 1215 1216 static void 1217 execlists_check_context(const struct intel_context *ce, 1218 const struct intel_engine_cs *engine) 1219 { 1220 const struct intel_ring *ring = ce->ring; 1221 u32 *regs = ce->lrc_reg_state; 1222 bool valid = true; 1223 int x; 1224 1225 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1226 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1227 engine->name, 1228 regs[CTX_RING_START], 1229 i915_ggtt_offset(ring->vma)); 1230 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1231 valid = false; 1232 } 1233 1234 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1235 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1236 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1237 engine->name, 1238 regs[CTX_RING_CTL], 1239 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1240 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1241 valid = false; 1242 } 1243 1244 x = lrc_ring_mi_mode(engine); 1245 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1246 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1247 engine->name, regs[x + 1]); 1248 regs[x + 1] &= ~STOP_RING; 1249 regs[x + 1] |= STOP_RING << 16; 1250 valid = false; 1251 } 1252 1253 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1254 } 1255 1256 static void restore_default_state(struct intel_context *ce, 1257 struct intel_engine_cs *engine) 1258 { 1259 u32 *regs; 1260 1261 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1262 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1263 1264 ce->runtime.last = intel_context_get_runtime(ce); 1265 } 1266 1267 static void reset_active(struct i915_request *rq, 1268 struct intel_engine_cs *engine) 1269 { 1270 struct intel_context * const ce = rq->context; 1271 u32 head; 1272 1273 /* 1274 * The executing context has been cancelled. We want to prevent 1275 * further execution along this context and propagate the error on 1276 * to anything depending on its results. 1277 * 1278 * In __i915_request_submit(), we apply the -EIO and remove the 1279 * requests' payloads for any banned requests. But first, we must 1280 * rewind the context back to the start of the incomplete request so 1281 * that we do not jump back into the middle of the batch. 
1282 * 1283 * We preserve the breadcrumbs and semaphores of the incomplete 1284 * requests so that inter-timeline dependencies (i.e other timelines) 1285 * remain correctly ordered. And we defer to __i915_request_submit() 1286 * so that all asynchronous waits are correctly handled. 1287 */ 1288 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1289 rq->fence.context, rq->fence.seqno); 1290 1291 /* On resubmission of the active request, payload will be scrubbed */ 1292 if (i915_request_completed(rq)) 1293 head = rq->tail; 1294 else 1295 head = active_request(ce->timeline, rq)->head; 1296 head = intel_ring_wrap(ce->ring, head); 1297 1298 /* Scrub the context image to prevent replaying the previous batch */ 1299 restore_default_state(ce, engine); 1300 __execlists_update_reg_state(ce, engine, head); 1301 1302 /* We've switched away, so this should be a no-op, but intent matters */ 1303 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1304 } 1305 1306 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1307 { 1308 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1309 ce->runtime.num_underflow += dt < 0; 1310 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1311 #endif 1312 } 1313 1314 static void intel_context_update_runtime(struct intel_context *ce) 1315 { 1316 u32 old; 1317 s32 dt; 1318 1319 if (intel_context_is_barrier(ce)) 1320 return; 1321 1322 old = ce->runtime.last; 1323 ce->runtime.last = intel_context_get_runtime(ce); 1324 dt = ce->runtime.last - old; 1325 1326 if (unlikely(dt <= 0)) { 1327 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1328 old, ce->runtime.last, dt); 1329 st_update_runtime_underflow(ce, dt); 1330 return; 1331 } 1332 1333 ewma_runtime_add(&ce->runtime.avg, dt); 1334 ce->runtime.total += dt; 1335 } 1336 1337 static inline struct intel_engine_cs * 1338 __execlists_schedule_in(struct i915_request *rq) 1339 { 1340 struct intel_engine_cs * const engine = rq->engine; 1341 struct intel_context * const ce = rq->context; 1342 1343 intel_context_get(ce); 1344 1345 if (unlikely(intel_context_is_banned(ce))) 1346 reset_active(rq, engine); 1347 1348 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1349 execlists_check_context(ce, engine); 1350 1351 if (ce->tag) { 1352 /* Use a fixed tag for OA and friends */ 1353 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1354 ce->lrc.ccid = ce->tag; 1355 } else { 1356 /* We don't need a strict matching tag, just different values */ 1357 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1358 1359 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1360 clear_bit(tag - 1, &engine->context_tag); 1361 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1362 1363 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1364 } 1365 1366 ce->lrc.ccid |= engine->execlists.ccid; 1367 1368 __intel_gt_pm_get(engine->gt); 1369 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1370 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1371 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1372 intel_engine_context_in(engine); 1373 1374 return engine; 1375 } 1376 1377 static inline struct i915_request * 1378 execlists_schedule_in(struct i915_request *rq, int idx) 1379 { 1380 struct intel_context * const ce = rq->context; 1381 struct intel_engine_cs *old; 1382 1383 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1384 trace_i915_request_in(rq, idx); 1385 1386 old = READ_ONCE(ce->inflight); 1387 do { 1388 if (!old) { 1389 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1390 break; 1391 } 1392 
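		/*
		 * ce->inflight packs a small occupancy count into the low,
		 * otherwise-unused bits of the engine pointer: ptr_inc()
		 * below bumps it for each ELSP port this context occupies,
		 * and execlists_schedule_out() drops it again with ptr_dec()
		 * until the context has fully left the hardware.
		 */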
} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1393 1394 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1395 return i915_request_get(rq); 1396 } 1397 1398 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1399 { 1400 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1401 struct i915_request *next = READ_ONCE(ve->request); 1402 1403 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1404 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1405 } 1406 1407 static inline void 1408 __execlists_schedule_out(struct i915_request *rq, 1409 struct intel_engine_cs * const engine, 1410 unsigned int ccid) 1411 { 1412 struct intel_context * const ce = rq->context; 1413 1414 /* 1415 * NB process_csb() is not under the engine->active.lock and hence 1416 * schedule_out can race with schedule_in meaning that we should 1417 * refrain from doing non-trivial work here. 1418 */ 1419 1420 /* 1421 * If we have just completed this context, the engine may now be 1422 * idle and we want to re-enter powersaving. 1423 */ 1424 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1425 i915_request_completed(rq)) 1426 intel_engine_add_retire(engine, ce->timeline); 1427 1428 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1429 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1430 if (ccid < BITS_PER_LONG) { 1431 GEM_BUG_ON(ccid == 0); 1432 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1433 set_bit(ccid - 1, &engine->context_tag); 1434 } 1435 1436 intel_context_update_runtime(ce); 1437 intel_engine_context_out(engine); 1438 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1439 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1440 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1441 intel_gt_pm_put_async(engine->gt); 1442 1443 /* 1444 * If this is part of a virtual engine, its next request may 1445 * have been blocked waiting for access to the active context. 1446 * We have to kick all the siblings again in case we need to 1447 * switch (e.g. the next request is not runnable on this 1448 * engine). Hopefully, we will already have submitted the next 1449 * request before the tasklet runs and do not need to rebuild 1450 * each virtual tree and kick everyone again. 1451 */ 1452 if (ce->engine != engine) 1453 kick_siblings(rq, ce); 1454 1455 intel_context_put(ce); 1456 } 1457 1458 static inline void 1459 execlists_schedule_out(struct i915_request *rq) 1460 { 1461 struct intel_context * const ce = rq->context; 1462 struct intel_engine_cs *cur, *old; 1463 u32 ccid; 1464 1465 trace_i915_request_out(rq); 1466 1467 ccid = rq->context->lrc.ccid; 1468 old = READ_ONCE(ce->inflight); 1469 do 1470 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1471 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1472 if (!cur) 1473 __execlists_schedule_out(rq, old, ccid); 1474 1475 i915_request_put(rq); 1476 } 1477 1478 static u64 execlists_update_context(struct i915_request *rq) 1479 { 1480 struct intel_context *ce = rq->context; 1481 u64 desc = ce->lrc.desc; 1482 u32 tail, prev; 1483 1484 /* 1485 * WaIdleLiteRestore:bdw,skl 1486 * 1487 * We should never submit the context with the same RING_TAIL twice 1488 * just in case we submit an empty ring, which confuses the HW. 1489 * 1490 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1491 * the normal request to be able to always advance the RING_TAIL on 1492 * subsequent resubmissions (for lite restore). 
Should that fail us, 1493 * and we try and submit the same tail again, force the context 1494 * reload. 1495 * 1496 * If we need to return to a preempted context, we need to skip the 1497 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1498 * HW has a tendency to ignore us rewinding the TAIL to the end of 1499 * an earlier request. 1500 */ 1501 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1502 prev = rq->ring->tail; 1503 tail = intel_ring_set_tail(rq->ring, rq->tail); 1504 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1505 desc |= CTX_DESC_FORCE_RESTORE; 1506 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1507 rq->tail = rq->wa_tail; 1508 1509 /* 1510 * Make sure the context image is complete before we submit it to HW. 1511 * 1512 * Ostensibly, writes (including the WCB) should be flushed prior to 1513 * an uncached write such as our mmio register access, the empirical 1514 * evidence (esp. on Braswell) suggests that the WC write into memory 1515 * may not be visible to the HW prior to the completion of the UC 1516 * register write and that we may begin execution from the context 1517 * before its image is complete leading to invalid PD chasing. 1518 */ 1519 wmb(); 1520 1521 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1522 return desc; 1523 } 1524 1525 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1526 { 1527 if (execlists->ctrl_reg) { 1528 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1529 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1530 } else { 1531 writel(upper_32_bits(desc), execlists->submit_reg); 1532 writel(lower_32_bits(desc), execlists->submit_reg); 1533 } 1534 } 1535 1536 static __maybe_unused char * 1537 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1538 { 1539 if (!rq) 1540 return ""; 1541 1542 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1543 prefix, 1544 rq->context->lrc.ccid, 1545 rq->fence.context, rq->fence.seqno, 1546 i915_request_completed(rq) ? "!" : 1547 i915_request_started(rq) ? 
"*" : 1548 "", 1549 rq_prio(rq)); 1550 1551 return buf; 1552 } 1553 1554 static __maybe_unused void 1555 trace_ports(const struct intel_engine_execlists *execlists, 1556 const char *msg, 1557 struct i915_request * const *ports) 1558 { 1559 const struct intel_engine_cs *engine = 1560 container_of(execlists, typeof(*engine), execlists); 1561 char __maybe_unused p0[40], p1[40]; 1562 1563 if (!ports[0]) 1564 return; 1565 1566 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1567 dump_port(p0, sizeof(p0), "", ports[0]), 1568 dump_port(p1, sizeof(p1), ", ", ports[1])); 1569 } 1570 1571 static inline bool 1572 reset_in_progress(const struct intel_engine_execlists *execlists) 1573 { 1574 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1575 } 1576 1577 static __maybe_unused bool 1578 assert_pending_valid(const struct intel_engine_execlists *execlists, 1579 const char *msg) 1580 { 1581 struct intel_engine_cs *engine = 1582 container_of(execlists, typeof(*engine), execlists); 1583 struct i915_request * const *port, *rq; 1584 struct intel_context *ce = NULL; 1585 bool sentinel = false; 1586 u32 ccid = -1; 1587 1588 trace_ports(execlists, msg, execlists->pending); 1589 1590 /* We may be messing around with the lists during reset, lalala */ 1591 if (reset_in_progress(execlists)) 1592 return true; 1593 1594 if (!execlists->pending[0]) { 1595 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1596 engine->name); 1597 return false; 1598 } 1599 1600 if (execlists->pending[execlists_num_ports(execlists)]) { 1601 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1602 engine->name, execlists_num_ports(execlists)); 1603 return false; 1604 } 1605 1606 for (port = execlists->pending; (rq = *port); port++) { 1607 unsigned long flags; 1608 bool ok = true; 1609 1610 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1611 GEM_BUG_ON(!i915_request_is_active(rq)); 1612 1613 if (ce == rq->context) { 1614 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1615 engine->name, 1616 ce->timeline->fence_context, 1617 port - execlists->pending); 1618 return false; 1619 } 1620 ce = rq->context; 1621 1622 if (ccid == ce->lrc.ccid) { 1623 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1624 engine->name, 1625 ccid, ce->timeline->fence_context, 1626 port - execlists->pending); 1627 return false; 1628 } 1629 ccid = ce->lrc.ccid; 1630 1631 /* 1632 * Sentinels are supposed to be the last request so they flush 1633 * the current execution off the HW. Check that they are the only 1634 * request in the pending submission. 1635 */ 1636 if (sentinel) { 1637 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1638 engine->name, 1639 ce->timeline->fence_context, 1640 port - execlists->pending); 1641 return false; 1642 } 1643 sentinel = i915_request_has_sentinel(rq); 1644 1645 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1646 if (!spin_trylock_irqsave(&rq->lock, flags)) 1647 continue; 1648 1649 if (i915_request_completed(rq)) 1650 goto unlock; 1651 1652 if (i915_active_is_idle(&ce->active) && 1653 !intel_context_is_barrier(ce)) { 1654 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1655 engine->name, 1656 ce->timeline->fence_context, 1657 port - execlists->pending); 1658 ok = false; 1659 goto unlock; 1660 } 1661 1662 if (!i915_vma_is_pinned(ce->state)) { 1663 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1664 engine->name, 1665 ce->timeline->fence_context, 1666 port - execlists->pending); 1667 ok = false; 1668 goto unlock; 1669 } 1670 1671 if (!i915_vma_is_pinned(ce->ring->vma)) { 1672 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1673 engine->name, 1674 ce->timeline->fence_context, 1675 port - execlists->pending); 1676 ok = false; 1677 goto unlock; 1678 } 1679 1680 unlock: 1681 spin_unlock_irqrestore(&rq->lock, flags); 1682 if (!ok) 1683 return false; 1684 } 1685 1686 return ce; 1687 } 1688 1689 static void execlists_submit_ports(struct intel_engine_cs *engine) 1690 { 1691 struct intel_engine_execlists *execlists = &engine->execlists; 1692 unsigned int n; 1693 1694 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1695 1696 /* 1697 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1698 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1699 * not be relinquished until the device is idle (see 1700 * i915_gem_idle_work_handler()). As a precaution, we make sure 1701 * that all ELSP are drained i.e. we have processed the CSB, 1702 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1703 */ 1704 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1705 1706 /* 1707 * ELSQ note: the submit queue is not cleared after being submitted 1708 * to the HW so we need to make sure we always clean it up. This is 1709 * currently ensured by the fact that we always write the same number 1710 * of elsq entries, keep this in mind before changing the loop below. 1711 */ 1712 for (n = execlists_num_ports(execlists); n--; ) { 1713 struct i915_request *rq = execlists->pending[n]; 1714 1715 write_desc(execlists, 1716 rq ? execlists_update_context(rq) : 0, 1717 n); 1718 } 1719 1720 /* we need to manually load the submit queue */ 1721 if (execlists->ctrl_reg) 1722 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1723 } 1724 1725 static bool ctx_single_port_submission(const struct intel_context *ce) 1726 { 1727 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1728 intel_context_force_single_submission(ce)); 1729 } 1730 1731 static bool can_merge_ctx(const struct intel_context *prev, 1732 const struct intel_context *next) 1733 { 1734 if (prev != next) 1735 return false; 1736 1737 if (ctx_single_port_submission(prev)) 1738 return false; 1739 1740 return true; 1741 } 1742 1743 static unsigned long i915_request_flags(const struct i915_request *rq) 1744 { 1745 return READ_ONCE(rq->fence.flags); 1746 } 1747 1748 static bool can_merge_rq(const struct i915_request *prev, 1749 const struct i915_request *next) 1750 { 1751 GEM_BUG_ON(prev == next); 1752 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1753 1754 /* 1755 * We do not submit known completed requests. Therefore if the next 1756 * request is already completed, we can pretend to merge it in 1757 * with the previous context (and we will skip updating the ELSP 1758 * and tracking). 
	 * Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_context(struct virtual_engine *ve,
				 struct intel_engine_cs *engine)
{
	unsigned int n;

	if (likely(engine == ve->siblings[0]))
		return;

	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
	if (!intel_engine_has_relative_mmio(engine))
		virtual_update_register_offsets(ve->context.lrc_reg_state,
						engine);

	/*
	 * Move the bound engine to the top of the list for
	 * future execution. We then kick this tasklet first
	 * before checking others, so that we preferentially
	 * reuse this set of bound registers.
	 */
	for (n = 1; n < ve->num_siblings; n++) {
		if (ve->siblings[n] == engine) {
			swap(ve->siblings[n], ve->siblings[0]);
			break;
		}
	}
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
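	 *
	 * In outline, the loop below does (sketch, not separate code):
	 *
	 *	move rq to the tail of its priority list;
	 *	collect every ready waiter of rq that runs on this engine
	 *	at the same priority;
	 *	take the next collected request and repeat, so that all of
	 *	rq's dependents end up queued behind it.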
1855 */ 1856 do { 1857 struct i915_dependency *p; 1858 1859 GEM_BUG_ON(i915_request_is_active(rq)); 1860 list_move_tail(&rq->sched.link, pl); 1861 1862 for_each_waiter(p, rq) { 1863 struct i915_request *w = 1864 container_of(p->waiter, typeof(*w), sched); 1865 1866 if (p->flags & I915_DEPENDENCY_WEAK) 1867 continue; 1868 1869 /* Leave semaphores spinning on the other engines */ 1870 if (w->engine != rq->engine) 1871 continue; 1872 1873 /* No waiter should start before its signaler */ 1874 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1875 i915_request_started(w) && 1876 !i915_request_completed(rq)); 1877 1878 GEM_BUG_ON(i915_request_is_active(w)); 1879 if (!i915_request_is_ready(w)) 1880 continue; 1881 1882 if (rq_prio(w) < rq_prio(rq)) 1883 continue; 1884 1885 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1886 list_move_tail(&w->sched.link, &list); 1887 } 1888 1889 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1890 } while (rq); 1891 } 1892 1893 static void defer_active(struct intel_engine_cs *engine) 1894 { 1895 struct i915_request *rq; 1896 1897 rq = __unwind_incomplete_requests(engine); 1898 if (!rq) 1899 return; 1900 1901 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1902 } 1903 1904 static bool 1905 need_timeslice(const struct intel_engine_cs *engine, 1906 const struct i915_request *rq, 1907 const struct rb_node *rb) 1908 { 1909 int hint; 1910 1911 if (!intel_engine_has_timeslices(engine)) 1912 return false; 1913 1914 hint = engine->execlists.queue_priority_hint; 1915 1916 if (rb) { 1917 const struct virtual_engine *ve = 1918 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1919 const struct intel_engine_cs *inflight = 1920 intel_context_inflight(&ve->context); 1921 1922 if (!inflight || inflight == engine) { 1923 struct i915_request *next; 1924 1925 rcu_read_lock(); 1926 next = READ_ONCE(ve->request); 1927 if (next) 1928 hint = max(hint, rq_prio(next)); 1929 rcu_read_unlock(); 1930 } 1931 } 1932 1933 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1934 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1935 1936 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1937 return hint >= effective_prio(rq); 1938 } 1939 1940 static bool 1941 timeslice_yield(const struct intel_engine_execlists *el, 1942 const struct i915_request *rq) 1943 { 1944 /* 1945 * Once bitten, forever smitten! 1946 * 1947 * If the active context ever busy-waited on a semaphore, 1948 * it will be treated as a hog until the end of its timeslice (i.e. 1949 * until it is scheduled out and replaced by a new submission, 1950 * possibly even its own lite-restore). The HW only sends an interrupt 1951 * on the first miss, and we do know if that semaphore has been 1952 * signaled, or even if it is now stuck on another semaphore. Play 1953 * safe, yield if it might be stuck -- it will be given a fresh 1954 * timeslice in the near future. 
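	 *
	 * (el->yield is recorded by the semaphore-wait interrupt handler as
	 * the ccid of the context that was caught busy-waiting, so the
	 * comparison below simply asks whether that is still the active
	 * context.)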
1955 */ 1956 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1957 } 1958 1959 static bool 1960 timeslice_expired(const struct intel_engine_execlists *el, 1961 const struct i915_request *rq) 1962 { 1963 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1964 } 1965 1966 static int 1967 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1968 { 1969 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1970 return engine->execlists.queue_priority_hint; 1971 1972 return rq_prio(list_next_entry(rq, sched.link)); 1973 } 1974 1975 static inline unsigned long 1976 timeslice(const struct intel_engine_cs *engine) 1977 { 1978 return READ_ONCE(engine->props.timeslice_duration_ms); 1979 } 1980 1981 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1982 { 1983 const struct intel_engine_execlists *execlists = &engine->execlists; 1984 const struct i915_request *rq = *execlists->active; 1985 1986 if (!rq || i915_request_completed(rq)) 1987 return 0; 1988 1989 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1990 return 0; 1991 1992 return timeslice(engine); 1993 } 1994 1995 static void set_timeslice(struct intel_engine_cs *engine) 1996 { 1997 unsigned long duration; 1998 1999 if (!intel_engine_has_timeslices(engine)) 2000 return; 2001 2002 duration = active_timeslice(engine); 2003 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2004 2005 set_timer_ms(&engine->execlists.timer, duration); 2006 } 2007 2008 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2009 { 2010 struct intel_engine_execlists *execlists = &engine->execlists; 2011 unsigned long duration; 2012 2013 if (!intel_engine_has_timeslices(engine)) 2014 return; 2015 2016 WRITE_ONCE(execlists->switch_priority_hint, prio); 2017 if (prio == INT_MIN) 2018 return; 2019 2020 if (timer_pending(&execlists->timer)) 2021 return; 2022 2023 duration = timeslice(engine); 2024 ENGINE_TRACE(engine, 2025 "start timeslicing, prio:%d, interval:%lu", 2026 prio, duration); 2027 2028 set_timer_ms(&execlists->timer, duration); 2029 } 2030 2031 static void record_preemption(struct intel_engine_execlists *execlists) 2032 { 2033 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2034 } 2035 2036 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2037 const struct i915_request *rq) 2038 { 2039 if (!rq) 2040 return 0; 2041 2042 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2043 if (unlikely(intel_context_is_banned(rq->context))) 2044 return 1; 2045 2046 return READ_ONCE(engine->props.preempt_timeout_ms); 2047 } 2048 2049 static void set_preempt_timeout(struct intel_engine_cs *engine, 2050 const struct i915_request *rq) 2051 { 2052 if (!intel_engine_has_preempt_reset(engine)) 2053 return; 2054 2055 set_timer_ms(&engine->execlists.preempt, 2056 active_preempt_timeout(engine, rq)); 2057 } 2058 2059 static inline void clear_ports(struct i915_request **ports, int count) 2060 { 2061 memset_p((void **)ports, NULL, count); 2062 } 2063 2064 static inline void 2065 copy_ports(struct i915_request **dst, struct i915_request **src, int count) 2066 { 2067 /* A memcpy_p() would be very useful here! 
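 *
 * A sketch of the helper this comment wishes for, mirroring the loop
 * below; each element is published with WRITE_ONCE() so that a
 * concurrent reader walking the ports should never observe a torn
 * pointer:
 *
 *	static inline void memcpy_p(void **dst, void * const *src, int n)
 *	{
 *		while (n--)
 *			WRITE_ONCE(*dst++, *src++);
 *	}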
*/ 2068 while (count--) 2069 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */ 2070 } 2071 2072 static void execlists_dequeue(struct intel_engine_cs *engine) 2073 { 2074 struct intel_engine_execlists * const execlists = &engine->execlists; 2075 struct i915_request **port = execlists->pending; 2076 struct i915_request ** const last_port = port + execlists->port_mask; 2077 struct i915_request * const *active; 2078 struct i915_request *last; 2079 struct rb_node *rb; 2080 bool submit = false; 2081 2082 /* 2083 * Hardware submission is through 2 ports. Conceptually each port 2084 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2085 * static for a context, and unique to each, so we only execute 2086 * requests belonging to a single context from each ring. RING_HEAD 2087 * is maintained by the CS in the context image, it marks the place 2088 * where it got up to last time, and through RING_TAIL we tell the CS 2089 * where we want to execute up to this time. 2090 * 2091 * In this list the requests are in order of execution. Consecutive 2092 * requests from the same context are adjacent in the ringbuffer. We 2093 * can combine these requests into a single RING_TAIL update: 2094 * 2095 * RING_HEAD...req1...req2 2096 * ^- RING_TAIL 2097 * since to execute req2 the CS must first execute req1. 2098 * 2099 * Our goal then is to point each port to the end of a consecutive 2100 * sequence of requests as being the most optimal (fewest wake ups 2101 * and context switches) submission. 2102 */ 2103 2104 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2105 struct virtual_engine *ve = 2106 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2107 struct i915_request *rq = READ_ONCE(ve->request); 2108 2109 if (!rq) { /* lazily cleanup after another engine handled rq */ 2110 rb_erase_cached(rb, &execlists->virtual); 2111 RB_CLEAR_NODE(rb); 2112 rb = rb_first_cached(&execlists->virtual); 2113 continue; 2114 } 2115 2116 if (!virtual_matches(ve, rq, engine)) { 2117 rb = rb_next(rb); 2118 continue; 2119 } 2120 2121 break; 2122 } 2123 2124 /* 2125 * If the queue is higher priority than the last 2126 * request in the currently active context, submit afresh. 2127 * We will resubmit again afterwards in case we need to split 2128 * the active context to interject the preemption request, 2129 * i.e. we will retrigger preemption following the ack in case 2130 * of trouble. 2131 */ 2132 active = READ_ONCE(execlists->active); 2133 2134 /* 2135 * In theory we can skip over completed contexts that have not 2136 * yet been processed by events (as those events are in flight): 2137 * 2138 * while ((last = *active) && i915_request_completed(last)) 2139 * active++; 2140 * 2141 * However, the GPU cannot handle this as it will ultimately 2142 * find itself trying to jump back into a context it has just 2143 * completed and barf. 2144 */ 2145 2146 if ((last = *active)) { 2147 if (need_preempt(engine, last, rb)) { 2148 if (i915_request_completed(last)) { 2149 tasklet_hi_schedule(&execlists->tasklet); 2150 return; 2151 } 2152 2153 ENGINE_TRACE(engine, 2154 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2155 last->fence.context, 2156 last->fence.seqno, 2157 last->sched.attr.priority, 2158 execlists->queue_priority_hint); 2159 record_preemption(execlists); 2160 2161 /* 2162 * Don't let the RING_HEAD advance past the breadcrumb 2163 * as we unwind (and until we resubmit) so that we do 2164 * not accidentally tell it to go backwards. 
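 *
 * (ring_set_paused() is assumed here to flip the HWS dword that the
 * semaphore emitted in the fini-breadcrumb busywaits on, roughly
 *
 *	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
 *
 * so while that dword is 1 the CS spins at the end of the old request
 * instead of running on past the breadcrumb we are about to rewind.)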
2165 */ 2166 ring_set_paused(engine, 1); 2167 2168 /* 2169 * Note that we have not stopped the GPU at this point, 2170 * so we are unwinding the incomplete requests as they 2171 * remain inflight and so by the time we do complete 2172 * the preemption, some of the unwound requests may 2173 * complete! 2174 */ 2175 __unwind_incomplete_requests(engine); 2176 2177 last = NULL; 2178 } else if (need_timeslice(engine, last, rb) && 2179 timeslice_expired(execlists, last)) { 2180 if (i915_request_completed(last)) { 2181 tasklet_hi_schedule(&execlists->tasklet); 2182 return; 2183 } 2184 2185 ENGINE_TRACE(engine, 2186 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2187 last->fence.context, 2188 last->fence.seqno, 2189 last->sched.attr.priority, 2190 execlists->queue_priority_hint, 2191 yesno(timeslice_yield(execlists, last))); 2192 2193 ring_set_paused(engine, 1); 2194 defer_active(engine); 2195 2196 /* 2197 * Unlike for preemption, if we rewind and continue 2198 * executing the same context as previously active, 2199 * the order of execution will remain the same and 2200 * the tail will only advance. We do not need to 2201 * force a full context restore, as a lite-restore 2202 * is sufficient to resample the monotonic TAIL. 2203 * 2204 * If we switch to any other context, similarly we 2205 * will not rewind TAIL of current context, and 2206 * normal save/restore will preserve state and allow 2207 * us to later continue executing the same request. 2208 */ 2209 last = NULL; 2210 } else { 2211 /* 2212 * Otherwise if we already have a request pending 2213 * for execution after the current one, we can 2214 * just wait until the next CS event before 2215 * queuing more. In either case we will force a 2216 * lite-restore preemption event, but if we wait 2217 * we hopefully coalesce several updates into a single 2218 * submission. 2219 */ 2220 if (!list_is_last(&last->sched.link, 2221 &engine->active.requests)) { 2222 /* 2223 * Even if ELSP[1] is occupied and not worthy 2224 * of timeslices, our queue might be. 2225 */ 2226 start_timeslice(engine, queue_prio(execlists)); 2227 return; 2228 } 2229 } 2230 } 2231 2232 while (rb) { /* XXX virtual is always taking precedence */ 2233 struct virtual_engine *ve = 2234 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2235 struct i915_request *rq; 2236 2237 spin_lock(&ve->base.active.lock); 2238 2239 rq = ve->request; 2240 if (unlikely(!rq)) { /* lost the race to a sibling */ 2241 spin_unlock(&ve->base.active.lock); 2242 rb_erase_cached(rb, &execlists->virtual); 2243 RB_CLEAR_NODE(rb); 2244 rb = rb_first_cached(&execlists->virtual); 2245 continue; 2246 } 2247 2248 GEM_BUG_ON(rq != ve->request); 2249 GEM_BUG_ON(rq->engine != &ve->base); 2250 GEM_BUG_ON(rq->context != &ve->context); 2251 2252 if (rq_prio(rq) >= queue_prio(execlists)) { 2253 if (!virtual_matches(ve, rq, engine)) { 2254 spin_unlock(&ve->base.active.lock); 2255 rb = rb_next(rb); 2256 continue; 2257 } 2258 2259 if (last && !can_merge_rq(last, rq)) { 2260 spin_unlock(&ve->base.active.lock); 2261 start_timeslice(engine, rq_prio(rq)); 2262 return; /* leave this for another sibling */ 2263 } 2264 2265 ENGINE_TRACE(engine, 2266 "virtual rq=%llx:%lld%s, new engine? %s\n", 2267 rq->fence.context, 2268 rq->fence.seqno, 2269 i915_request_completed(rq) ? "!" : 2270 i915_request_started(rq) ? 
"*" : 2271 "", 2272 yesno(engine != ve->siblings[0])); 2273 2274 WRITE_ONCE(ve->request, NULL); 2275 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2276 INT_MIN); 2277 rb_erase_cached(rb, &execlists->virtual); 2278 RB_CLEAR_NODE(rb); 2279 2280 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2281 WRITE_ONCE(rq->engine, engine); 2282 2283 if (__i915_request_submit(rq)) { 2284 /* 2285 * Only after we confirm that we will submit 2286 * this request (i.e. it has not already 2287 * completed), do we want to update the context. 2288 * 2289 * This serves two purposes. It avoids 2290 * unnecessary work if we are resubmitting an 2291 * already completed request after timeslicing. 2292 * But more importantly, it prevents us altering 2293 * ve->siblings[] on an idle context, where 2294 * we may be using ve->siblings[] in 2295 * virtual_context_enter / virtual_context_exit. 2296 */ 2297 virtual_xfer_context(ve, engine); 2298 GEM_BUG_ON(ve->siblings[0] != engine); 2299 2300 submit = true; 2301 last = rq; 2302 } 2303 i915_request_put(rq); 2304 2305 /* 2306 * Hmm, we have a bunch of virtual engine requests, 2307 * but the first one was already completed (thanks 2308 * preempt-to-busy!). Keep looking at the veng queue 2309 * until we have no more relevant requests (i.e. 2310 * the normal submit queue has higher priority). 2311 */ 2312 if (!submit) { 2313 spin_unlock(&ve->base.active.lock); 2314 rb = rb_first_cached(&execlists->virtual); 2315 continue; 2316 } 2317 } 2318 2319 spin_unlock(&ve->base.active.lock); 2320 break; 2321 } 2322 2323 while ((rb = rb_first_cached(&execlists->queue))) { 2324 struct i915_priolist *p = to_priolist(rb); 2325 struct i915_request *rq, *rn; 2326 int i; 2327 2328 priolist_for_each_request_consume(rq, rn, p, i) { 2329 bool merge = true; 2330 2331 /* 2332 * Can we combine this request with the current port? 2333 * It has to be the same context/ringbuffer and not 2334 * have any exceptions (e.g. GVT saying never to 2335 * combine contexts). 2336 * 2337 * If we can combine the requests, we can execute both 2338 * by updating the RING_TAIL to point to the end of the 2339 * second request, and so we never need to tell the 2340 * hardware about the first. 2341 */ 2342 if (last && !can_merge_rq(last, rq)) { 2343 /* 2344 * If we are on the second port and cannot 2345 * combine this request with the last, then we 2346 * are done. 2347 */ 2348 if (port == last_port) 2349 goto done; 2350 2351 /* 2352 * We must not populate both ELSP[] with the 2353 * same LRCA, i.e. we must submit 2 different 2354 * contexts if we submit 2 ELSP. 2355 */ 2356 if (last->context == rq->context) 2357 goto done; 2358 2359 if (i915_request_has_sentinel(last)) 2360 goto done; 2361 2362 /* 2363 * If GVT overrides us we only ever submit 2364 * port[0], leaving port[1] empty. Note that we 2365 * also have to be careful that we don't queue 2366 * the same context (even though a different 2367 * request) to the second port. 
2368 */ 2369 if (ctx_single_port_submission(last->context) || 2370 ctx_single_port_submission(rq->context)) 2371 goto done; 2372 2373 merge = false; 2374 } 2375 2376 if (__i915_request_submit(rq)) { 2377 if (!merge) { 2378 *port = execlists_schedule_in(last, port - execlists->pending); 2379 port++; 2380 last = NULL; 2381 } 2382 2383 GEM_BUG_ON(last && 2384 !can_merge_ctx(last->context, 2385 rq->context)); 2386 GEM_BUG_ON(last && 2387 i915_seqno_passed(last->fence.seqno, 2388 rq->fence.seqno)); 2389 2390 submit = true; 2391 last = rq; 2392 } 2393 } 2394 2395 rb_erase_cached(&p->node, &execlists->queue); 2396 i915_priolist_free(p); 2397 } 2398 2399 done: 2400 /* 2401 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2402 * 2403 * We choose the priority hint such that if we add a request of greater 2404 * priority than this, we kick the submission tasklet to decide on 2405 * the right order of submitting the requests to hardware. We must 2406 * also be prepared to reorder requests as they are in-flight on the 2407 * HW. We derive the priority hint then as the first "hole" in 2408 * the HW submission ports and if there are no available slots, 2409 * the priority of the lowest executing request, i.e. last. 2410 * 2411 * When we do receive a higher priority request ready to run from the 2412 * user, see queue_request(), the priority hint is bumped to that 2413 * request triggering preemption on the next dequeue (or subsequent 2414 * interrupt for secondary ports). 2415 */ 2416 execlists->queue_priority_hint = queue_prio(execlists); 2417 2418 if (submit) { 2419 *port = execlists_schedule_in(last, port - execlists->pending); 2420 execlists->switch_priority_hint = 2421 switch_prio(engine, *execlists->pending); 2422 2423 /* 2424 * Skip if we ended up with exactly the same set of requests, 2425 * e.g. 
trying to timeslice a pair of ordered contexts 2426 */ 2427 if (!memcmp(active, execlists->pending, 2428 (port - execlists->pending + 1) * sizeof(*port))) { 2429 do 2430 execlists_schedule_out(fetch_and_zero(port)); 2431 while (port-- != execlists->pending); 2432 2433 goto skip_submit; 2434 } 2435 clear_ports(port + 1, last_port - port); 2436 2437 WRITE_ONCE(execlists->yield, -1); 2438 set_preempt_timeout(engine, *active); 2439 execlists_submit_ports(engine); 2440 } else { 2441 start_timeslice(engine, execlists->queue_priority_hint); 2442 skip_submit: 2443 ring_set_paused(engine, 0); 2444 } 2445 } 2446 2447 static void 2448 cancel_port_requests(struct intel_engine_execlists * const execlists) 2449 { 2450 struct i915_request * const *port; 2451 2452 for (port = execlists->pending; *port; port++) 2453 execlists_schedule_out(*port); 2454 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2455 2456 /* Mark the end of active before we overwrite *active */ 2457 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2458 execlists_schedule_out(*port); 2459 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2460 2461 smp_wmb(); /* complete the seqlock for execlists_active() */ 2462 WRITE_ONCE(execlists->active, execlists->inflight); 2463 } 2464 2465 static inline void 2466 invalidate_csb_entries(const u64 *first, const u64 *last) 2467 { 2468 clflush((void *)first); 2469 clflush((void *)last); 2470 } 2471 2472 /* 2473 * Starting with Gen12, the status has a new format: 2474 * 2475 * bit 0: switched to new queue 2476 * bit 1: reserved 2477 * bit 2: semaphore wait mode (poll or signal), only valid when 2478 * switch detail is set to "wait on semaphore" 2479 * bits 3-5: engine class 2480 * bits 6-11: engine instance 2481 * bits 12-14: reserved 2482 * bits 15-25: sw context id of the lrc the GT switched to 2483 * bits 26-31: sw counter of the lrc the GT switched to 2484 * bits 32-35: context switch detail 2485 * - 0: ctx complete 2486 * - 1: wait on sync flip 2487 * - 2: wait on vblank 2488 * - 3: wait on scanline 2489 * - 4: wait on semaphore 2490 * - 5: context preempted (not on SEMAPHORE_WAIT or 2491 * WAIT_FOR_EVENT) 2492 * bit 36: reserved 2493 * bits 37-43: wait detail (for switch detail 1 to 4) 2494 * bits 44-46: reserved 2495 * bits 47-57: sw context id of the lrc the GT switched away from 2496 * bits 58-63: sw counter of the lrc the GT switched away from 2497 */ 2498 static inline bool gen12_csb_parse(const u64 *csb) 2499 { 2500 bool ctx_away_valid; 2501 bool new_queue; 2502 u64 entry; 2503 2504 /* HSD#22011248461 */ 2505 entry = READ_ONCE(*csb); 2506 if (unlikely(entry == -1)) { 2507 preempt_disable(); 2508 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50)) 2509 GEM_WARN_ON("50us CSB timeout"); 2510 preempt_enable(); 2511 } 2512 WRITE_ONCE(*(u64 *)csb, -1); 2513 2514 ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry)); 2515 new_queue = 2516 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2517 2518 /* 2519 * The context switch detail is not guaranteed to be 5 when a preemption 2520 * occurs, so we can't just check for that. The check below works for 2521 * all the cases we care about, including preemptions of WAIT 2522 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2523 * would require some extra handling, but we don't support that. 
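 *
 * For reference, extracting the fields used here from the 64b entry
 * documented above looks roughly like (mirroring the code below):
 *
 *	u32 lo = lower_32_bits(entry), hi = upper_32_bits(entry);
 *	bool new_queue  = lo & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;  (bit 0)
 *	bool away_valid = GEN12_CSB_CTX_VALID(hi);   (the "switched away" id)
 *	u32 detail      = GEN12_CTX_SWITCH_DETAIL(hi);   (bits 32-35)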
2524 */
2525 if (!ctx_away_valid || new_queue) {
2526 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2527 return true;
2528 }
2529
2530 /*
2531 * switch detail = 5 is covered by the case above and we do not expect a
2532 * context switch on an unsuccessful wait instruction since we always
2533 * use polling mode.
2534 */
2535 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2536 return false;
2537 }
2538
2539 static inline bool gen8_csb_parse(const u64 *csb)
2540 {
2541 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2542 }
2543
2544 static void process_csb(struct intel_engine_cs *engine)
2545 {
2546 struct intel_engine_execlists * const execlists = &engine->execlists;
2547 const u64 * const buf = execlists->csb_status;
2548 const u8 num_entries = execlists->csb_size;
2549 u8 head, tail;
2550
2551 /*
2552 * As we modify our execlists state tracking we require exclusive
2553 * access. Either we are inside the tasklet, or the tasklet is disabled
2554 * and we assume that is only inside the reset paths and so serialised.
2555 */
2556 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2557 !reset_in_progress(execlists));
2558 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2559
2560 /*
2561 * Note that csb_write, csb_status may be either in HWSP or mmio.
2562 * When reading from the csb_write mmio register, we have to be
2563 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2564 * the low 4 bits. As it happens we know the next 4 bits are always
2565 * zero and so we can simply mask off the low u8 of the register
2566 * and treat it identically to reading from the HWSP (without having
2567 * to use explicit shifting and masking, and probably bifurcating
2568 * the code to handle the legacy mmio read).
2569 */
2570 head = execlists->csb_head;
2571 tail = READ_ONCE(*execlists->csb_write);
2572 if (unlikely(head == tail))
2573 return;
2574
2575 /*
2576 * We will consume all events from HW, or at least pretend to.
2577 *
2578 * The sequence of events from the HW is deterministic, and derived
2579 * from our writes to the ELSP, with a smidgen of variability for
2580 * the arrival of the asynchronous requests wrt the inflight
2581 * execution. If the HW sends an event that does not correspond with
2582 * the one we are expecting, we have to abandon all hope as we lose
2583 * all tracking of what the engine is actually executing. We will
2584 * only detect we are out of sequence with the HW when we get an
2585 * 'impossible' event because we have already drained our own
2586 * preemption/promotion queue. If this occurs, we know that we likely
2587 * lost track of execution earlier and must unwind and restart, the
2588 * simplest way is to stop processing the event queue and force the
2589 * engine to reset.
2590 */
2591 execlists->csb_head = tail;
2592 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2593
2594 /*
2595 * Hopefully paired with a wmb() in HW!
2596 *
2597 * We must complete the read of the write pointer before any reads
2598 * from the CSB, so that we do not see stale values. Without an rmb
2599 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2600 * we perform the READ_ONCE(*csb_write).
2601 */
2602 rmb();
2603 do {
2604 bool promote;
2605
2606 if (++head == num_entries)
2607 head = 0;
2608
2609 /*
2610 * We are flying near dragons again.
2611 *
2612 * We hold a reference to the request in execlist_port[]
2613 * but no more than that.
We are operating in softirq 2614 * context and so cannot hold any mutex or sleep. That 2615 * prevents us stopping the requests we are processing 2616 * in port[] from being retired simultaneously (the 2617 * breadcrumb will be complete before we see the 2618 * context-switch). As we only hold the reference to the 2619 * request, any pointer chasing underneath the request 2620 * is subject to a potential use-after-free. Thus we 2621 * store all of the bookkeeping within port[] as 2622 * required, and avoid using unguarded pointers beneath 2623 * request itself. The same applies to the atomic 2624 * status notifier. 2625 */ 2626 2627 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2628 head, 2629 upper_32_bits(buf[head]), 2630 lower_32_bits(buf[head])); 2631 2632 if (INTEL_GEN(engine->i915) >= 12) 2633 promote = gen12_csb_parse(buf + head); 2634 else 2635 promote = gen8_csb_parse(buf + head); 2636 if (promote) { 2637 struct i915_request * const *old = execlists->active; 2638 2639 if (GEM_WARN_ON(!*execlists->pending)) { 2640 execlists->error_interrupt |= ERROR_CSB; 2641 break; 2642 } 2643 2644 ring_set_paused(engine, 0); 2645 2646 /* Point active to the new ELSP; prevent overwriting */ 2647 WRITE_ONCE(execlists->active, execlists->pending); 2648 smp_wmb(); /* notify execlists_active() */ 2649 2650 /* cancel old inflight, prepare for switch */ 2651 trace_ports(execlists, "preempted", old); 2652 while (*old) 2653 execlists_schedule_out(*old++); 2654 2655 /* switch pending to inflight */ 2656 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2657 copy_ports(execlists->inflight, 2658 execlists->pending, 2659 execlists_num_ports(execlists)); 2660 smp_wmb(); /* complete the seqlock */ 2661 WRITE_ONCE(execlists->active, execlists->inflight); 2662 2663 /* XXX Magic delay for tgl */ 2664 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 2665 2666 WRITE_ONCE(execlists->pending[0], NULL); 2667 } else { 2668 if (GEM_WARN_ON(!*execlists->active)) { 2669 execlists->error_interrupt |= ERROR_CSB; 2670 break; 2671 } 2672 2673 /* port0 completed, advanced to port1 */ 2674 trace_ports(execlists, "completed", execlists->active); 2675 2676 /* 2677 * We rely on the hardware being strongly 2678 * ordered, that the breadcrumb write is 2679 * coherent (visible from the CPU) before the 2680 * user interrupt is processed. One might assume 2681 * that the breadcrumb write being before the 2682 * user interrupt and the CS event for the context 2683 * switch would therefore be before the CS event 2684 * itself... 
2685 */ 2686 if (GEM_SHOW_DEBUG() && 2687 !i915_request_completed(*execlists->active)) { 2688 struct i915_request *rq = *execlists->active; 2689 const u32 *regs __maybe_unused = 2690 rq->context->lrc_reg_state; 2691 2692 ENGINE_TRACE(engine, 2693 "context completed before request!\n"); 2694 ENGINE_TRACE(engine, 2695 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2696 ENGINE_READ(engine, RING_START), 2697 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2698 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2699 ENGINE_READ(engine, RING_CTL), 2700 ENGINE_READ(engine, RING_MI_MODE)); 2701 ENGINE_TRACE(engine, 2702 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2703 i915_ggtt_offset(rq->ring->vma), 2704 rq->head, rq->tail, 2705 rq->fence.context, 2706 lower_32_bits(rq->fence.seqno), 2707 hwsp_seqno(rq)); 2708 ENGINE_TRACE(engine, 2709 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2710 regs[CTX_RING_START], 2711 regs[CTX_RING_HEAD], 2712 regs[CTX_RING_TAIL]); 2713 } 2714 2715 execlists_schedule_out(*execlists->active++); 2716 2717 GEM_BUG_ON(execlists->active - execlists->inflight > 2718 execlists_num_ports(execlists)); 2719 } 2720 } while (head != tail); 2721 2722 set_timeslice(engine); 2723 2724 /* 2725 * Gen11 has proven to fail wrt global observation point between 2726 * entry and tail update, failing on the ordering and thus 2727 * we see an old entry in the context status buffer. 2728 * 2729 * Forcibly evict out entries for the next gpu csb update, 2730 * to increase the odds that we get a fresh entries with non 2731 * working hardware. The cost for doing so comes out mostly with 2732 * the wash as hardware, working or not, will need to do the 2733 * invalidation before. 2734 */ 2735 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2736 } 2737 2738 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2739 { 2740 lockdep_assert_held(&engine->active.lock); 2741 if (!READ_ONCE(engine->execlists.pending[0])) { 2742 rcu_read_lock(); /* protect peeking at execlists->active */ 2743 execlists_dequeue(engine); 2744 rcu_read_unlock(); 2745 } 2746 } 2747 2748 static void __execlists_hold(struct i915_request *rq) 2749 { 2750 LIST_HEAD(list); 2751 2752 do { 2753 struct i915_dependency *p; 2754 2755 if (i915_request_is_active(rq)) 2756 __i915_request_unsubmit(rq); 2757 2758 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2759 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2760 i915_request_set_hold(rq); 2761 RQ_TRACE(rq, "on hold\n"); 2762 2763 for_each_waiter(p, rq) { 2764 struct i915_request *w = 2765 container_of(p->waiter, typeof(*w), sched); 2766 2767 /* Leave semaphores spinning on the other engines */ 2768 if (w->engine != rq->engine) 2769 continue; 2770 2771 if (!i915_request_is_ready(w)) 2772 continue; 2773 2774 if (i915_request_completed(w)) 2775 continue; 2776 2777 if (i915_request_on_hold(w)) 2778 continue; 2779 2780 list_move_tail(&w->sched.link, &list); 2781 } 2782 2783 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2784 } while (rq); 2785 } 2786 2787 static bool execlists_hold(struct intel_engine_cs *engine, 2788 struct i915_request *rq) 2789 { 2790 spin_lock_irq(&engine->active.lock); 2791 2792 if (i915_request_completed(rq)) { /* too late! 
*/
2793 rq = NULL;
2794 goto unlock;
2795 }
2796
2797 if (rq->engine != engine) { /* preempted virtual engine */
2798 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2799
2800 /*
2801 * intel_context_inflight() is only protected by virtue
2802 * of process_csb() being called only by the tasklet (or
2803 * directly from inside reset while the tasklet is suspended).
2804 * Assert that neither of those is allowed to run while we
2805 * poke at the request queues.
2806 */
2807 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2808
2809 /*
2810 * An unsubmitted request along a virtual engine will
2811 * remain on the active (this) engine until we are able
2812 * to process the context switch away (and so mark the
2813 * context as no longer in flight). That cannot have happened
2814 * yet, otherwise we would not be hanging!
2815 */
2816 spin_lock(&ve->base.active.lock);
2817 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2818 GEM_BUG_ON(ve->request != rq);
2819 ve->request = NULL;
2820 spin_unlock(&ve->base.active.lock);
2821 i915_request_put(rq);
2822
2823 rq->engine = engine;
2824 }
2825
2826 /*
2827 * Transfer this request onto the hold queue to prevent it
2828 * being resubmitted to HW (and potentially completed) before we have
2829 * released it. Since we may have already submitted following
2830 * requests, we need to remove those as well.
2831 */
2832 GEM_BUG_ON(i915_request_on_hold(rq));
2833 GEM_BUG_ON(rq->engine != engine);
2834 __execlists_hold(rq);
2835 GEM_BUG_ON(list_empty(&engine->active.hold));
2836
2837 unlock:
2838 spin_unlock_irq(&engine->active.lock);
2839 return rq;
2840 }
2841
2842 static bool hold_request(const struct i915_request *rq)
2843 {
2844 struct i915_dependency *p;
2845 bool result = false;
2846
2847 /*
2848 * If one of our ancestors is on hold, we must also be on hold,
2849 * otherwise we will bypass it and execute before it.
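 *
 * A tiny example of the invariant (names purely illustrative): for a
 * dependency chain A -> B -> C entirely on this engine, putting A on
 * hold via __execlists_hold() drags B and C onto engine->active.hold
 * as well, and __execlists_unhold() below only requeues a waiter once
 * none of its other signalers remain on hold:
 *
 *	hold(A):	A, B, C -> engine->active.hold
 *	unhold(A):	A, B, C -> back onto their priority lists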
2850 */ 2851 rcu_read_lock(); 2852 for_each_signaler(p, rq) { 2853 const struct i915_request *s = 2854 container_of(p->signaler, typeof(*s), sched); 2855 2856 if (s->engine != rq->engine) 2857 continue; 2858 2859 result = i915_request_on_hold(s); 2860 if (result) 2861 break; 2862 } 2863 rcu_read_unlock(); 2864 2865 return result; 2866 } 2867 2868 static void __execlists_unhold(struct i915_request *rq) 2869 { 2870 LIST_HEAD(list); 2871 2872 do { 2873 struct i915_dependency *p; 2874 2875 RQ_TRACE(rq, "hold release\n"); 2876 2877 GEM_BUG_ON(!i915_request_on_hold(rq)); 2878 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2879 2880 i915_request_clear_hold(rq); 2881 list_move_tail(&rq->sched.link, 2882 i915_sched_lookup_priolist(rq->engine, 2883 rq_prio(rq))); 2884 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2885 2886 /* Also release any children on this engine that are ready */ 2887 for_each_waiter(p, rq) { 2888 struct i915_request *w = 2889 container_of(p->waiter, typeof(*w), sched); 2890 2891 /* Propagate any change in error status */ 2892 if (rq->fence.error) 2893 i915_request_set_error_once(w, rq->fence.error); 2894 2895 if (w->engine != rq->engine) 2896 continue; 2897 2898 if (!i915_request_on_hold(w)) 2899 continue; 2900 2901 /* Check that no other parents are also on hold */ 2902 if (hold_request(w)) 2903 continue; 2904 2905 list_move_tail(&w->sched.link, &list); 2906 } 2907 2908 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2909 } while (rq); 2910 } 2911 2912 static void execlists_unhold(struct intel_engine_cs *engine, 2913 struct i915_request *rq) 2914 { 2915 spin_lock_irq(&engine->active.lock); 2916 2917 /* 2918 * Move this request back to the priority queue, and all of its 2919 * children and grandchildren that were suspended along with it. 2920 */ 2921 __execlists_unhold(rq); 2922 2923 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2924 engine->execlists.queue_priority_hint = rq_prio(rq); 2925 tasklet_hi_schedule(&engine->execlists.tasklet); 2926 } 2927 2928 spin_unlock_irq(&engine->active.lock); 2929 } 2930 2931 struct execlists_capture { 2932 struct work_struct work; 2933 struct i915_request *rq; 2934 struct i915_gpu_coredump *error; 2935 }; 2936 2937 static void execlists_capture_work(struct work_struct *work) 2938 { 2939 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2940 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2941 struct intel_engine_cs *engine = cap->rq->engine; 2942 struct intel_gt_coredump *gt = cap->error->gt; 2943 struct intel_engine_capture_vma *vma; 2944 2945 /* Compress all the objects attached to the request, slow! 
*/ 2946 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2947 if (vma) { 2948 struct i915_vma_compress *compress = 2949 i915_vma_capture_prepare(gt); 2950 2951 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2952 i915_vma_capture_finish(gt, compress); 2953 } 2954 2955 gt->simulated = gt->engine->simulated; 2956 cap->error->simulated = gt->simulated; 2957 2958 /* Publish the error state, and announce it to the world */ 2959 i915_error_state_store(cap->error); 2960 i915_gpu_coredump_put(cap->error); 2961 2962 /* Return this request and all that depend upon it for signaling */ 2963 execlists_unhold(engine, cap->rq); 2964 i915_request_put(cap->rq); 2965 2966 kfree(cap); 2967 } 2968 2969 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2970 { 2971 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2972 struct execlists_capture *cap; 2973 2974 cap = kmalloc(sizeof(*cap), gfp); 2975 if (!cap) 2976 return NULL; 2977 2978 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2979 if (!cap->error) 2980 goto err_cap; 2981 2982 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2983 if (!cap->error->gt) 2984 goto err_gpu; 2985 2986 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2987 if (!cap->error->gt->engine) 2988 goto err_gt; 2989 2990 return cap; 2991 2992 err_gt: 2993 kfree(cap->error->gt); 2994 err_gpu: 2995 kfree(cap->error); 2996 err_cap: 2997 kfree(cap); 2998 return NULL; 2999 } 3000 3001 static struct i915_request * 3002 active_context(struct intel_engine_cs *engine, u32 ccid) 3003 { 3004 const struct intel_engine_execlists * const el = &engine->execlists; 3005 struct i915_request * const *port, *rq; 3006 3007 /* 3008 * Use the most recent result from process_csb(), but just in case 3009 * we trigger an error (via interrupt) before the first CS event has 3010 * been written, peek at the next submission. 3011 */ 3012 3013 for (port = el->active; (rq = *port); port++) { 3014 if (rq->context->lrc.ccid == ccid) { 3015 ENGINE_TRACE(engine, 3016 "ccid found at active:%zd\n", 3017 port - el->active); 3018 return rq; 3019 } 3020 } 3021 3022 for (port = el->pending; (rq = *port); port++) { 3023 if (rq->context->lrc.ccid == ccid) { 3024 ENGINE_TRACE(engine, 3025 "ccid found at pending:%zd\n", 3026 port - el->pending); 3027 return rq; 3028 } 3029 } 3030 3031 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3032 return NULL; 3033 } 3034 3035 static u32 active_ccid(struct intel_engine_cs *engine) 3036 { 3037 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3038 } 3039 3040 static void execlists_capture(struct intel_engine_cs *engine) 3041 { 3042 struct execlists_capture *cap; 3043 3044 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3045 return; 3046 3047 /* 3048 * We need to _quickly_ capture the engine state before we reset. 3049 * We are inside an atomic section (softirq) here and we are delaying 3050 * the forced preemption event. 3051 */ 3052 cap = capture_regs(engine); 3053 if (!cap) 3054 return; 3055 3056 spin_lock_irq(&engine->active.lock); 3057 cap->rq = active_context(engine, active_ccid(engine)); 3058 if (cap->rq) { 3059 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3060 cap->rq = i915_request_get_rcu(cap->rq); 3061 } 3062 spin_unlock_irq(&engine->active.lock); 3063 if (!cap->rq) 3064 goto err_free; 3065 3066 /* 3067 * Remove the request from the execlists queue, and take ownership 3068 * of the request. 
We pass it to our worker who will _slowly_ compress 3069 * all the pages the _user_ requested for debugging their batch, after 3070 * which we return it to the queue for signaling. 3071 * 3072 * By removing them from the execlists queue, we also remove the 3073 * requests from being processed by __unwind_incomplete_requests() 3074 * during the intel_engine_reset(), and so they will *not* be replayed 3075 * afterwards. 3076 * 3077 * Note that because we have not yet reset the engine at this point, 3078 * it is possible for the request that we have identified as being 3079 * guilty, did in fact complete and we will then hit an arbitration 3080 * point allowing the outstanding preemption to succeed. The likelihood 3081 * of that is very low (as capturing of the engine registers should be 3082 * fast enough to run inside an irq-off atomic section!), so we will 3083 * simply hold that request accountable for being non-preemptible 3084 * long enough to force the reset. 3085 */ 3086 if (!execlists_hold(engine, cap->rq)) 3087 goto err_rq; 3088 3089 INIT_WORK(&cap->work, execlists_capture_work); 3090 schedule_work(&cap->work); 3091 return; 3092 3093 err_rq: 3094 i915_request_put(cap->rq); 3095 err_free: 3096 i915_gpu_coredump_put(cap->error); 3097 kfree(cap); 3098 } 3099 3100 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3101 { 3102 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3103 unsigned long *lock = &engine->gt->reset.flags; 3104 3105 if (!intel_has_reset_engine(engine->gt)) 3106 return; 3107 3108 if (test_and_set_bit(bit, lock)) 3109 return; 3110 3111 ENGINE_TRACE(engine, "reset for %s\n", msg); 3112 3113 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3114 tasklet_disable_nosync(&engine->execlists.tasklet); 3115 3116 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3117 execlists_capture(engine); 3118 intel_engine_reset(engine, msg); 3119 3120 tasklet_enable(&engine->execlists.tasklet); 3121 clear_and_wake_up_bit(bit, lock); 3122 } 3123 3124 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3125 { 3126 const struct timer_list *t = &engine->execlists.preempt; 3127 3128 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3129 return false; 3130 3131 if (!timer_expired(t)) 3132 return false; 3133 3134 return READ_ONCE(engine->execlists.pending[0]); 3135 } 3136 3137 /* 3138 * Check the unread Context Status Buffers and manage the submission of new 3139 * contexts to the ELSP accordingly. 3140 */ 3141 static void execlists_submission_tasklet(unsigned long data) 3142 { 3143 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3144 bool timeout = preempt_timeout(engine); 3145 3146 process_csb(engine); 3147 3148 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3149 const char *msg; 3150 3151 /* Generate the error message in priority wrt to the user! 
*/ 3152 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3153 msg = "CS error"; /* thrown by a user payload */ 3154 else if (engine->execlists.error_interrupt & ERROR_CSB) 3155 msg = "invalid CSB event"; 3156 else 3157 msg = "internal error"; 3158 3159 engine->execlists.error_interrupt = 0; 3160 execlists_reset(engine, msg); 3161 } 3162 3163 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3164 unsigned long flags; 3165 3166 spin_lock_irqsave(&engine->active.lock, flags); 3167 __execlists_submission_tasklet(engine); 3168 spin_unlock_irqrestore(&engine->active.lock, flags); 3169 3170 /* Recheck after serialising with direct-submission */ 3171 if (unlikely(timeout && preempt_timeout(engine))) 3172 execlists_reset(engine, "preemption time out"); 3173 } 3174 } 3175 3176 static void __execlists_kick(struct intel_engine_execlists *execlists) 3177 { 3178 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3179 tasklet_hi_schedule(&execlists->tasklet); 3180 } 3181 3182 #define execlists_kick(t, member) \ 3183 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3184 3185 static void execlists_timeslice(struct timer_list *timer) 3186 { 3187 execlists_kick(timer, timer); 3188 } 3189 3190 static void execlists_preempt(struct timer_list *timer) 3191 { 3192 execlists_kick(timer, preempt); 3193 } 3194 3195 static void queue_request(struct intel_engine_cs *engine, 3196 struct i915_request *rq) 3197 { 3198 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3199 list_add_tail(&rq->sched.link, 3200 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3201 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3202 } 3203 3204 static void __submit_queue_imm(struct intel_engine_cs *engine) 3205 { 3206 struct intel_engine_execlists * const execlists = &engine->execlists; 3207 3208 if (reset_in_progress(execlists)) 3209 return; /* defer until we restart the engine following reset */ 3210 3211 __execlists_submission_tasklet(engine); 3212 } 3213 3214 static void submit_queue(struct intel_engine_cs *engine, 3215 const struct i915_request *rq) 3216 { 3217 struct intel_engine_execlists *execlists = &engine->execlists; 3218 3219 if (rq_prio(rq) <= execlists->queue_priority_hint) 3220 return; 3221 3222 execlists->queue_priority_hint = rq_prio(rq); 3223 __submit_queue_imm(engine); 3224 } 3225 3226 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3227 const struct i915_request *rq) 3228 { 3229 GEM_BUG_ON(i915_request_on_hold(rq)); 3230 return !list_empty(&engine->active.hold) && hold_request(rq); 3231 } 3232 3233 static void flush_csb(struct intel_engine_cs *engine) 3234 { 3235 struct intel_engine_execlists *el = &engine->execlists; 3236 3237 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3238 if (!reset_in_progress(el)) 3239 process_csb(engine); 3240 tasklet_unlock(&el->tasklet); 3241 } 3242 } 3243 3244 static void execlists_submit_request(struct i915_request *request) 3245 { 3246 struct intel_engine_cs *engine = request->engine; 3247 unsigned long flags; 3248 3249 /* Hopefully we clear execlists->pending[] to let us through */ 3250 flush_csb(engine); 3251 3252 /* Will be called from irq-context when using foreign fences. 
*/ 3253 spin_lock_irqsave(&engine->active.lock, flags); 3254 3255 if (unlikely(ancestor_on_hold(engine, request))) { 3256 RQ_TRACE(request, "ancestor on hold\n"); 3257 list_add_tail(&request->sched.link, &engine->active.hold); 3258 i915_request_set_hold(request); 3259 } else { 3260 queue_request(engine, request); 3261 3262 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3263 GEM_BUG_ON(list_empty(&request->sched.link)); 3264 3265 submit_queue(engine, request); 3266 } 3267 3268 spin_unlock_irqrestore(&engine->active.lock, flags); 3269 } 3270 3271 static void __execlists_context_fini(struct intel_context *ce) 3272 { 3273 intel_ring_put(ce->ring); 3274 i915_vma_put(ce->state); 3275 } 3276 3277 static void execlists_context_destroy(struct kref *kref) 3278 { 3279 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3280 3281 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3282 GEM_BUG_ON(intel_context_is_pinned(ce)); 3283 3284 if (ce->state) 3285 __execlists_context_fini(ce); 3286 3287 intel_context_fini(ce); 3288 intel_context_free(ce); 3289 } 3290 3291 static void 3292 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3293 { 3294 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3295 return; 3296 3297 vaddr += engine->context_size; 3298 3299 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3300 } 3301 3302 static void 3303 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3304 { 3305 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3306 return; 3307 3308 vaddr += engine->context_size; 3309 3310 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3311 drm_err_once(&engine->i915->drm, 3312 "%s context redzone overwritten!\n", 3313 engine->name); 3314 } 3315 3316 static void execlists_context_unpin(struct intel_context *ce) 3317 { 3318 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3319 ce->engine); 3320 } 3321 3322 static void execlists_context_post_unpin(struct intel_context *ce) 3323 { 3324 i915_gem_object_unpin_map(ce->state->obj); 3325 } 3326 3327 static u32 * 3328 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3329 { 3330 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3331 MI_SRM_LRM_GLOBAL_GTT | 3332 MI_LRI_LRM_CS_MMIO; 3333 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3334 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3335 CTX_TIMESTAMP * sizeof(u32); 3336 *cs++ = 0; 3337 3338 *cs++ = MI_LOAD_REGISTER_REG | 3339 MI_LRR_SOURCE_CS_MMIO | 3340 MI_LRI_LRM_CS_MMIO; 3341 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3342 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3343 3344 *cs++ = MI_LOAD_REGISTER_REG | 3345 MI_LRR_SOURCE_CS_MMIO | 3346 MI_LRI_LRM_CS_MMIO; 3347 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3348 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3349 3350 return cs; 3351 } 3352 3353 static u32 * 3354 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3355 { 3356 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3357 3358 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3359 MI_SRM_LRM_GLOBAL_GTT | 3360 MI_LRI_LRM_CS_MMIO; 3361 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3362 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3363 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3364 *cs++ = 0; 3365 3366 return cs; 3367 } 3368 3369 static u32 * 3370 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3371 { 3372 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3373 3374 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3375 
MI_SRM_LRM_GLOBAL_GTT | 3376 MI_LRI_LRM_CS_MMIO; 3377 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3378 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3379 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3380 *cs++ = 0; 3381 3382 *cs++ = MI_LOAD_REGISTER_REG | 3383 MI_LRR_SOURCE_CS_MMIO | 3384 MI_LRI_LRM_CS_MMIO; 3385 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3386 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3387 3388 return cs; 3389 } 3390 3391 static u32 * 3392 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3393 { 3394 cs = gen12_emit_timestamp_wa(ce, cs); 3395 cs = gen12_emit_cmd_buf_wa(ce, cs); 3396 cs = gen12_emit_restore_scratch(ce, cs); 3397 3398 return cs; 3399 } 3400 3401 static u32 * 3402 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3403 { 3404 cs = gen12_emit_timestamp_wa(ce, cs); 3405 cs = gen12_emit_restore_scratch(ce, cs); 3406 3407 return cs; 3408 } 3409 3410 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3411 { 3412 return PAGE_SIZE * ce->wa_bb_page; 3413 } 3414 3415 static u32 *context_indirect_bb(const struct intel_context *ce) 3416 { 3417 void *ptr; 3418 3419 GEM_BUG_ON(!ce->wa_bb_page); 3420 3421 ptr = ce->lrc_reg_state; 3422 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3423 ptr += context_wa_bb_offset(ce); 3424 3425 return ptr; 3426 } 3427 3428 static void 3429 setup_indirect_ctx_bb(const struct intel_context *ce, 3430 const struct intel_engine_cs *engine, 3431 u32 *(*emit)(const struct intel_context *, u32 *)) 3432 { 3433 u32 * const start = context_indirect_bb(ce); 3434 u32 *cs; 3435 3436 cs = emit(ce, start); 3437 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3438 while ((unsigned long)cs % CACHELINE_BYTES) 3439 *cs++ = MI_NOOP; 3440 3441 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3442 i915_ggtt_offset(ce->state) + 3443 context_wa_bb_offset(ce), 3444 (cs - start) * sizeof(*cs)); 3445 } 3446 3447 static void 3448 __execlists_update_reg_state(const struct intel_context *ce, 3449 const struct intel_engine_cs *engine, 3450 u32 head) 3451 { 3452 struct intel_ring *ring = ce->ring; 3453 u32 *regs = ce->lrc_reg_state; 3454 3455 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3456 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3457 3458 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3459 regs[CTX_RING_HEAD] = head; 3460 regs[CTX_RING_TAIL] = ring->tail; 3461 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3462 3463 /* RPCS */ 3464 if (engine->class == RENDER_CLASS) { 3465 regs[CTX_R_PWR_CLK_STATE] = 3466 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3467 3468 i915_oa_init_reg_state(ce, engine); 3469 } 3470 3471 if (ce->wa_bb_page) { 3472 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3473 3474 fn = gen12_emit_indirect_ctx_xcs; 3475 if (ce->engine->class == RENDER_CLASS) 3476 fn = gen12_emit_indirect_ctx_rcs; 3477 3478 /* Mutually exclusive wrt to global indirect bb */ 3479 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3480 setup_indirect_ctx_bb(ce, engine, fn); 3481 } 3482 } 3483 3484 static int 3485 execlists_context_pre_pin(struct intel_context *ce, 3486 struct i915_gem_ww_ctx *ww, void **vaddr) 3487 { 3488 GEM_BUG_ON(!ce->state); 3489 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3490 3491 *vaddr = i915_gem_object_pin_map(ce->state->obj, 3492 i915_coherent_map_type(ce->engine->i915) | 3493 I915_MAP_OVERRIDE); 3494 3495 return PTR_ERR_OR_ZERO(*vaddr); 3496 } 3497 3498 static int 3499 
__execlists_context_pin(struct intel_context *ce, 3500 struct intel_engine_cs *engine, 3501 void *vaddr) 3502 { 3503 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3504 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; 3505 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3506 3507 return 0; 3508 } 3509 3510 static int execlists_context_pin(struct intel_context *ce, void *vaddr) 3511 { 3512 return __execlists_context_pin(ce, ce->engine, vaddr); 3513 } 3514 3515 static int execlists_context_alloc(struct intel_context *ce) 3516 { 3517 return __execlists_context_alloc(ce, ce->engine); 3518 } 3519 3520 static void execlists_context_reset(struct intel_context *ce) 3521 { 3522 CE_TRACE(ce, "reset\n"); 3523 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3524 3525 intel_ring_reset(ce->ring, ce->ring->emit); 3526 3527 /* Scrub away the garbage */ 3528 execlists_init_reg_state(ce->lrc_reg_state, 3529 ce, ce->engine, ce->ring, true); 3530 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3531 3532 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3533 } 3534 3535 static const struct intel_context_ops execlists_context_ops = { 3536 .alloc = execlists_context_alloc, 3537 3538 .pre_pin = execlists_context_pre_pin, 3539 .pin = execlists_context_pin, 3540 .unpin = execlists_context_unpin, 3541 .post_unpin = execlists_context_post_unpin, 3542 3543 .enter = intel_context_enter_engine, 3544 .exit = intel_context_exit_engine, 3545 3546 .reset = execlists_context_reset, 3547 .destroy = execlists_context_destroy, 3548 }; 3549 3550 static u32 hwsp_offset(const struct i915_request *rq) 3551 { 3552 const struct intel_timeline_cacheline *cl; 3553 3554 /* Before the request is executed, the timeline/cachline is fixed */ 3555 3556 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1); 3557 if (cl) 3558 return cl->ggtt_offset; 3559 3560 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset; 3561 } 3562 3563 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3564 { 3565 u32 *cs; 3566 3567 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3568 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3569 return 0; 3570 3571 cs = intel_ring_begin(rq, 6); 3572 if (IS_ERR(cs)) 3573 return PTR_ERR(cs); 3574 3575 /* 3576 * Check if we have been preempted before we even get started. 3577 * 3578 * After this point i915_request_started() reports true, even if 3579 * we get preempted and so are no longer running. 3580 */ 3581 *cs++ = MI_ARB_CHECK; 3582 *cs++ = MI_NOOP; 3583 3584 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3585 *cs++ = hwsp_offset(rq); 3586 *cs++ = 0; 3587 *cs++ = rq->fence.seqno - 1; 3588 3589 intel_ring_advance(rq, cs); 3590 3591 /* Record the updated position of the request's payload */ 3592 rq->infix = intel_ring_offset(rq, cs); 3593 3594 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3595 3596 return 0; 3597 } 3598 3599 static int emit_pdps(struct i915_request *rq) 3600 { 3601 const struct intel_engine_cs * const engine = rq->engine; 3602 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3603 int err, i; 3604 u32 *cs; 3605 3606 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3607 3608 /* 3609 * Beware ye of the dragons, this sequence is magic! 3610 * 3611 * Small changes to this sequence can cause anything from 3612 * GPU hangs to forcewake errors and machine lockups! 
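 *
 * For reference, the payload emitted below boils down to one posted LRI
 * covering every PDP register pair of the 3-level ppgtt:
 *
 *	MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED
 *	  GEN8_RING_PDP_UDW/LDW(base, i), for i = GEN8_3LVL_PDPES-1 .. 0,
 *	  each loaded with the upper/lower half of that page directory's
 *	  dma address
 *
 * preceded by the EMIT_FLUSH and EMIT_INVALIDATE just below to clear
 * out any residual traffic from the context load first.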
3613 */
3614
3615 /* Flush any residual operations from the context load */
3616 err = engine->emit_flush(rq, EMIT_FLUSH);
3617 if (err)
3618 return err;
3619
3620 /* Magic required to prevent forcewake errors! */
3621 err = engine->emit_flush(rq, EMIT_INVALIDATE);
3622 if (err)
3623 return err;
3624
3625 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3626 if (IS_ERR(cs))
3627 return PTR_ERR(cs);
3628
3629 /* Ensure the LRI have landed before we invalidate & continue */
3630 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3631 for (i = GEN8_3LVL_PDPES; i--; ) {
3632 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3633 u32 base = engine->mmio_base;
3634
3635 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3636 *cs++ = upper_32_bits(pd_daddr);
3637 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3638 *cs++ = lower_32_bits(pd_daddr);
3639 }
3640 *cs++ = MI_NOOP;
3641
3642 intel_ring_advance(rq, cs);
3643
3644 return 0;
3645 }
3646
3647 static int execlists_request_alloc(struct i915_request *request)
3648 {
3649 int ret;
3650
3651 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3652
3653 /*
3654 * Flush enough space to reduce the likelihood of waiting after
3655 * we start building the request - in which case we will just
3656 * have to repeat work.
3657 */
3658 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3659
3660 /*
3661 * Note that after this point, we have committed to using
3662 * this request as it is being used to both track the
3663 * state of engine initialisation and liveness of the
3664 * golden renderstate above. Think twice before you try
3665 * to cancel/unwind this request now.
3666 */
3667
3668 if (!i915_vm_is_4lvl(request->context->vm)) {
3669 ret = emit_pdps(request);
3670 if (ret)
3671 return ret;
3672 }
3673
3674 /* Unconditionally invalidate GPU caches and TLBs. */
3675 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3676 if (ret)
3677 return ret;
3678
3679 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3680 return 0;
3681 }
3682
3683 /*
3684 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
3685 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3686 * but there is a slight complication as this is applied in the WA batch where the
3687 * values are only initialized once so we cannot take the register value at the
3688 * beginning and reuse it further; hence we save its value to memory, upload a
3689 * constant value with bit21 set and then we restore it with the saved value.
3690 * To simplify the WA, a constant value is formed by using the default value
3691 * of this register. This shouldn't be a problem because we are only modifying
3692 * it for a short period and this batch is non-preemptible. We can of course
3693 * use additional instructions that read the actual value of the register
3694 * at that time and set our bit of interest but it makes the WA complicated.
3695 *
3696 * This WA is also required for Gen9 so extracting as a function avoids
3697 * code duplication.
3698 */
3699 static u32 *
3700 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3701 {
3702 /* NB no one else is allowed to scribble over scratch + 256!
*/ 3703 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3704 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3705 *batch++ = intel_gt_scratch_offset(engine->gt, 3706 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3707 *batch++ = 0; 3708 3709 *batch++ = MI_LOAD_REGISTER_IMM(1); 3710 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3711 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3712 3713 batch = gen8_emit_pipe_control(batch, 3714 PIPE_CONTROL_CS_STALL | 3715 PIPE_CONTROL_DC_FLUSH_ENABLE, 3716 0); 3717 3718 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3719 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3720 *batch++ = intel_gt_scratch_offset(engine->gt, 3721 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3722 *batch++ = 0; 3723 3724 return batch; 3725 } 3726 3727 /* 3728 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3729 * initialized at the beginning and shared across all contexts but this field 3730 * helps us to have multiple batches at different offsets and select them based 3731 * on a criteria. At the moment this batch always start at the beginning of the page 3732 * and at this point we don't have multiple wa_ctx batch buffers. 3733 * 3734 * The number of WA applied are not known at the beginning; we use this field 3735 * to return the no of DWORDS written. 3736 * 3737 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3738 * so it adds NOOPs as padding to make it cacheline aligned. 3739 * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together 3740 * makes a complete batch buffer. 3741 */ 3742 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3743 { 3744 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3745 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3746 3747 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3748 if (IS_BROADWELL(engine->i915)) 3749 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3750 3751 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3752 /* Actual scratch location is at 128 bytes offset */ 3753 batch = gen8_emit_pipe_control(batch, 3754 PIPE_CONTROL_FLUSH_L3 | 3755 PIPE_CONTROL_STORE_DATA_INDEX | 3756 PIPE_CONTROL_CS_STALL | 3757 PIPE_CONTROL_QW_WRITE, 3758 LRC_PPHWSP_SCRATCH_ADDR); 3759 3760 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3761 3762 /* Pad to end of cacheline */ 3763 while ((unsigned long)batch % CACHELINE_BYTES) 3764 *batch++ = MI_NOOP; 3765 3766 /* 3767 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3768 * execution depends on the length specified in terms of cache lines 3769 * in the register CTX_RCS_INDIRECT_CTX 3770 */ 3771 3772 return batch; 3773 } 3774 3775 struct lri { 3776 i915_reg_t reg; 3777 u32 value; 3778 }; 3779 3780 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3781 { 3782 GEM_BUG_ON(!count || count > 63); 3783 3784 *batch++ = MI_LOAD_REGISTER_IMM(count); 3785 do { 3786 *batch++ = i915_mmio_reg_offset(lri->reg); 3787 *batch++ = lri->value; 3788 } while (lri++, --count); 3789 *batch++ = MI_NOOP; 3790 3791 return batch; 3792 } 3793 3794 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3795 { 3796 static const struct lri lri[] = { 3797 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3798 { 3799 COMMON_SLICE_CHICKEN2, 3800 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3801 0), 3802 }, 3803 3804 /* BSpec: 11391 */ 3805 { 3806 FF_SLICE_CHICKEN, 3807 
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3808 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3809 }, 3810 3811 /* BSpec: 11299 */ 3812 { 3813 _3D_CHICKEN3, 3814 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3815 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3816 } 3817 }; 3818 3819 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3820 3821 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3822 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3823 3824 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3825 batch = gen8_emit_pipe_control(batch, 3826 PIPE_CONTROL_FLUSH_L3 | 3827 PIPE_CONTROL_STORE_DATA_INDEX | 3828 PIPE_CONTROL_CS_STALL | 3829 PIPE_CONTROL_QW_WRITE, 3830 LRC_PPHWSP_SCRATCH_ADDR); 3831 3832 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3833 3834 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3835 if (HAS_POOLED_EU(engine->i915)) { 3836 /* 3837 * EU pool configuration is setup along with golden context 3838 * during context initialization. This value depends on 3839 * device type (2x6 or 3x6) and needs to be updated based 3840 * on which subslice is disabled especially for 2x6 3841 * devices, however it is safe to load default 3842 * configuration of 3x6 device instead of masking off 3843 * corresponding bits because HW ignores bits of a disabled 3844 * subslice and drops down to appropriate config. Please 3845 * see render_state_setup() in i915_gem_render_state.c for 3846 * possible configurations, to avoid duplication they are 3847 * not shown here again. 3848 */ 3849 *batch++ = GEN9_MEDIA_POOL_STATE; 3850 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3851 *batch++ = 0x00777000; 3852 *batch++ = 0; 3853 *batch++ = 0; 3854 *batch++ = 0; 3855 } 3856 3857 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3858 3859 /* Pad to end of cacheline */ 3860 while ((unsigned long)batch % CACHELINE_BYTES) 3861 *batch++ = MI_NOOP; 3862 3863 return batch; 3864 } 3865 3866 static u32 * 3867 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3868 { 3869 int i; 3870 3871 /* 3872 * WaPipeControlBefore3DStateSamplePattern: cnl 3873 * 3874 * Ensure the engine is idle prior to programming a 3875 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3876 */ 3877 batch = gen8_emit_pipe_control(batch, 3878 PIPE_CONTROL_CS_STALL, 3879 0); 3880 /* 3881 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3882 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3883 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3884 * confusing. Since gen8_emit_pipe_control() already advances the 3885 * batch by 6 dwords, we advance the other 10 here, completing a 3886 * cacheline. It's not clear if the workaround requires this padding 3887 * before other commands, or if it's just the regular padding we would 3888 * already have for the workaround bb, so leave it here for now. 
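 *
 * Spelled out (an illustrative tally only, nothing is emitted here beyond what
 * the code below already does): gen8_emit_pipe_control() advances the batch by
 * 6 dwords and the loop below adds 10 * MI_NOOP, i.e. 16 dwords in total,
 * which is 16 * sizeof(u32) == 64 bytes == CACHELINE_BYTES, so a batch that
 * entered this function cacheline aligned leaves the final padding loop with
 * nothing to do.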
3889 */ 3890 for (i = 0; i < 10; i++) 3891 *batch++ = MI_NOOP; 3892 3893 /* Pad to end of cacheline */ 3894 while ((unsigned long)batch % CACHELINE_BYTES) 3895 *batch++ = MI_NOOP; 3896 3897 return batch; 3898 } 3899 3900 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3901 3902 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3903 { 3904 struct drm_i915_gem_object *obj; 3905 struct i915_vma *vma; 3906 int err; 3907 3908 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3909 if (IS_ERR(obj)) 3910 return PTR_ERR(obj); 3911 3912 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3913 if (IS_ERR(vma)) { 3914 err = PTR_ERR(vma); 3915 goto err; 3916 } 3917 3918 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH); 3919 if (err) 3920 goto err; 3921 3922 engine->wa_ctx.vma = vma; 3923 return 0; 3924 3925 err: 3926 i915_gem_object_put(obj); 3927 return err; 3928 } 3929 3930 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3931 { 3932 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3933 } 3934 3935 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3936 3937 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3938 { 3939 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3940 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3941 &wa_ctx->per_ctx }; 3942 wa_bb_func_t wa_bb_fn[2]; 3943 void *batch, *batch_ptr; 3944 unsigned int i; 3945 int ret; 3946 3947 if (engine->class != RENDER_CLASS) 3948 return 0; 3949 3950 switch (INTEL_GEN(engine->i915)) { 3951 case 12: 3952 case 11: 3953 return 0; 3954 case 10: 3955 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3956 wa_bb_fn[1] = NULL; 3957 break; 3958 case 9: 3959 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3960 wa_bb_fn[1] = NULL; 3961 break; 3962 case 8: 3963 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3964 wa_bb_fn[1] = NULL; 3965 break; 3966 default: 3967 MISSING_CASE(INTEL_GEN(engine->i915)); 3968 return 0; 3969 } 3970 3971 ret = lrc_setup_wa_ctx(engine); 3972 if (ret) { 3973 drm_dbg(&engine->i915->drm, 3974 "Failed to setup context WA page: %d\n", ret); 3975 return ret; 3976 } 3977 3978 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 3979 3980 /* 3981 * Emit the two workaround batch buffers, recording the offset from the 3982 * start of the workaround batch buffer object for each and their 3983 * respective sizes. 3984 */ 3985 batch_ptr = batch; 3986 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3987 wa_bb[i]->offset = batch_ptr - batch; 3988 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3989 CACHELINE_BYTES))) { 3990 ret = -EINVAL; 3991 break; 3992 } 3993 if (wa_bb_fn[i]) 3994 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3995 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3996 } 3997 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3998 3999 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 4000 __i915_gem_object_release_map(wa_ctx->vma->obj); 4001 if (ret) 4002 lrc_destroy_wa_ctx(engine); 4003 4004 return ret; 4005 } 4006 4007 static void reset_csb_pointers(struct intel_engine_cs *engine) 4008 { 4009 struct intel_engine_execlists * const execlists = &engine->execlists; 4010 const unsigned int reset_value = execlists->csb_size - 1; 4011 4012 ring_set_paused(engine, 0); 4013 4014 /* 4015 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 4016 * Bludgeon them with a mmio update to be sure. 
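 *
 * As a simplified picture of the bookkeeping done below (an illustrative
 * model only; the real consumer is process_csb() in this file, and consume()
 * here is just a stand-in for its event handling):
 *
 *	head = execlists->csb_head;
 *	while (head != READ_ONCE(*execlists->csb_write)) {
 *		head = (head + 1) % execlists->csb_size;
 *		consume(execlists->csb_status[head]);
 *	}
 *
 * Parking both the cached head and the (faked) HW write pointer on the last
 * entry, csb_size - 1, therefore makes entry 0 the first entry examined after
 * the reset, matching where the HW resumes writing.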
4017 */ 4018 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4019 0xffff << 16 | reset_value << 8 | reset_value); 4020 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4021 4022 /* 4023 * After a reset, the HW starts writing into CSB entry [0]. We 4024 * therefore have to set our HEAD pointer back one entry so that 4025 * the *first* entry we check is entry 0. To complicate this further, 4026 * as we don't wait for the first interrupt after reset, we have to 4027 * fake the HW write to point back to the last entry so that our 4028 * inline comparison of our cached head position against the last HW 4029 * write works even before the first interrupt. 4030 */ 4031 execlists->csb_head = reset_value; 4032 WRITE_ONCE(*execlists->csb_write, reset_value); 4033 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4034 4035 /* Check that the GPU does indeed update the CSB entries! */ 4036 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64)); 4037 invalidate_csb_entries(&execlists->csb_status[0], 4038 &execlists->csb_status[reset_value]); 4039 4040 /* Once more for luck and our trusty paranoia */ 4041 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4042 0xffff << 16 | reset_value << 8 | reset_value); 4043 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4044 4045 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4046 } 4047 4048 static void execlists_sanitize(struct intel_engine_cs *engine) 4049 { 4050 /* 4051 * Poison residual state on resume, in case the suspend didn't! 4052 * 4053 * We have to assume that across suspend/resume (or other loss 4054 * of control) that the contents of our pinned buffers has been 4055 * lost, replaced by garbage. Since this doesn't always happen, 4056 * let's poison such state so that we more quickly spot when 4057 * we falsely assume it has been preserved. 4058 */ 4059 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4060 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4061 4062 reset_csb_pointers(engine); 4063 4064 /* 4065 * The kernel_context HWSP is stored in the status_page. As above, 4066 * that may be lost on resume/initialisation, and so we need to 4067 * reset the value in the HWSP. 4068 */ 4069 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4070 4071 /* And scrub the dirty cachelines for the HWSP */ 4072 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4073 } 4074 4075 static void enable_error_interrupt(struct intel_engine_cs *engine) 4076 { 4077 u32 status; 4078 4079 engine->execlists.error_interrupt = 0; 4080 ENGINE_WRITE(engine, RING_EMR, ~0u); 4081 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4082 4083 status = ENGINE_READ(engine, RING_ESR); 4084 if (unlikely(status)) { 4085 drm_err(&engine->i915->drm, 4086 "engine '%s' resumed still in error: %08x\n", 4087 engine->name, status); 4088 __intel_gt_reset(engine->gt, engine->mask); 4089 } 4090 4091 /* 4092 * On current gen8+, we have 2 signals to play with 4093 * 4094 * - I915_ERROR_INSTUCTION (bit 0) 4095 * 4096 * Generate an error if the command parser encounters an invalid 4097 * instruction 4098 * 4099 * This is a fatal error. 4100 * 4101 * - CP_PRIV (bit 2) 4102 * 4103 * Generate an error on privilege violation (where the CP replaces 4104 * the instruction with a no-op). This also fires for writes into 4105 * read-only scratch pages. 4106 * 4107 * This is a non-fatal error, parsing continues. 
4108 * 4109 * * there are a few others defined for odd HW that we do not use 4110 * 4111 * Since CP_PRIV fires for cases where we have chosen to ignore the 4112 * error (as the HW is validating and suppressing the mistakes), we 4113 * only unmask the instruction error bit. 4114 */ 4115 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4116 } 4117 4118 static void enable_execlists(struct intel_engine_cs *engine) 4119 { 4120 u32 mode; 4121 4122 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4123 4124 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4125 4126 if (INTEL_GEN(engine->i915) >= 11) 4127 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4128 else 4129 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4130 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4131 4132 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4133 4134 ENGINE_WRITE_FW(engine, 4135 RING_HWS_PGA, 4136 i915_ggtt_offset(engine->status_page.vma)); 4137 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4138 4139 enable_error_interrupt(engine); 4140 4141 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4142 } 4143 4144 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4145 { 4146 bool unexpected = false; 4147 4148 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4149 drm_dbg(&engine->i915->drm, 4150 "STOP_RING still set in RING_MI_MODE\n"); 4151 unexpected = true; 4152 } 4153 4154 return unexpected; 4155 } 4156 4157 static int execlists_resume(struct intel_engine_cs *engine) 4158 { 4159 intel_mocs_init_engine(engine); 4160 4161 intel_breadcrumbs_reset(engine->breadcrumbs); 4162 4163 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4164 struct drm_printer p = drm_debug_printer(__func__); 4165 4166 intel_engine_dump(engine, &p, NULL); 4167 } 4168 4169 enable_execlists(engine); 4170 4171 return 0; 4172 } 4173 4174 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4175 { 4176 struct intel_engine_execlists * const execlists = &engine->execlists; 4177 unsigned long flags; 4178 4179 ENGINE_TRACE(engine, "depth<-%d\n", 4180 atomic_read(&execlists->tasklet.count)); 4181 4182 /* 4183 * Prevent request submission to the hardware until we have 4184 * completed the reset in i915_gem_reset_finish(). If a request 4185 * is completed by one engine, it may then queue a request 4186 * to a second via its execlists->tasklet *just* as we are 4187 * calling engine->resume() and also writing the ELSP. 4188 * Turning off the execlists->tasklet until the reset is over 4189 * prevents the race. 4190 */ 4191 __tasklet_disable_sync_once(&execlists->tasklet); 4192 GEM_BUG_ON(!reset_in_progress(execlists)); 4193 4194 /* And flush any current direct submission. */ 4195 spin_lock_irqsave(&engine->active.lock, flags); 4196 spin_unlock_irqrestore(&engine->active.lock, flags); 4197 4198 /* 4199 * We stop engines, otherwise we might get failed reset and a 4200 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4201 * from system hang if batchbuffer is progressing when 4202 * the reset is issued, regardless of READY_TO_RESET ack. 4203 * Thus assume it is best to stop engines on all gens 4204 * where we have a gpu reset. 
4205 * 4206 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4207 * 4208 * FIXME: Wa for more modern gens needs to be validated 4209 */ 4210 ring_set_paused(engine, 1); 4211 intel_engine_stop_cs(engine); 4212 4213 engine->execlists.reset_ccid = active_ccid(engine); 4214 } 4215 4216 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4217 { 4218 int x; 4219 4220 x = lrc_ring_mi_mode(engine); 4221 if (x != -1) { 4222 regs[x + 1] &= ~STOP_RING; 4223 regs[x + 1] |= STOP_RING << 16; 4224 } 4225 } 4226 4227 static void __execlists_reset_reg_state(const struct intel_context *ce, 4228 const struct intel_engine_cs *engine) 4229 { 4230 u32 *regs = ce->lrc_reg_state; 4231 4232 __reset_stop_ring(regs, engine); 4233 } 4234 4235 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4236 { 4237 struct intel_engine_execlists * const execlists = &engine->execlists; 4238 struct intel_context *ce; 4239 struct i915_request *rq; 4240 u32 head; 4241 4242 mb(); /* paranoia: read the CSB pointers from after the reset */ 4243 clflush(execlists->csb_write); 4244 mb(); 4245 4246 process_csb(engine); /* drain preemption events */ 4247 4248 /* Following the reset, we need to reload the CSB read/write pointers */ 4249 reset_csb_pointers(engine); 4250 4251 /* 4252 * Save the currently executing context, even if we completed 4253 * its request, it was still running at the time of the 4254 * reset and will have been clobbered. 4255 */ 4256 rq = active_context(engine, engine->execlists.reset_ccid); 4257 if (!rq) 4258 goto unwind; 4259 4260 ce = rq->context; 4261 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4262 4263 if (i915_request_completed(rq)) { 4264 /* Idle context; tidy up the ring so we can restart afresh */ 4265 head = intel_ring_wrap(ce->ring, rq->tail); 4266 goto out_replay; 4267 } 4268 4269 /* We still have requests in-flight; the engine should be active */ 4270 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4271 4272 /* Context has requests still in-flight; it should not be idle! */ 4273 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4274 4275 rq = active_request(ce->timeline, rq); 4276 head = intel_ring_wrap(ce->ring, rq->head); 4277 GEM_BUG_ON(head == ce->ring->tail); 4278 4279 /* 4280 * If this request hasn't started yet, e.g. it is waiting on a 4281 * semaphore, we need to avoid skipping the request or else we 4282 * break the signaling chain. However, if the context is corrupt 4283 * the request will not restart and we will be stuck with a wedged 4284 * device. It is quite often the case that if we issue a reset 4285 * while the GPU is loading the context image, that the context 4286 * image becomes corrupt. 4287 * 4288 * Otherwise, if we have not started yet, the request should replay 4289 * perfectly and we do not need to flag the result as being erroneous. 4290 */ 4291 if (!i915_request_started(rq)) 4292 goto out_replay; 4293 4294 /* 4295 * If the request was innocent, we leave the request in the ELSP 4296 * and will try to replay it on restarting. The context image may 4297 * have been corrupted by the reset, in which case we may have 4298 * to service a new GPU hang, but more likely we can continue on 4299 * without impact. 4300 * 4301 * If the request was guilty, we presume the context is corrupt 4302 * and have to at least restore the RING register in the context 4303 * image back to the expected values to skip over the guilty request. 
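 *
 * As a compact sketch of the paths above and below (simplified; the real code
 * also refines rq via active_request() first):
 *
 *	if (i915_request_completed(rq))
 *		head = intel_ring_wrap(ce->ring, rq->tail);   <- idle: restart after it
 *	else {
 *		head = intel_ring_wrap(ce->ring, rq->head);
 *		if (i915_request_started(rq))
 *			__i915_request_reset(rq, stalled);    <- mark guilty/innocent
 *	}
 *
 * and in all cases we then rebuild the ring registers from 'head' at
 * out_replay below.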
4304 */ 4305 __i915_request_reset(rq, stalled); 4306 4307 /* 4308 * We want a simple context + ring to execute the breadcrumb update. 4309 * We cannot rely on the context being intact across the GPU hang, 4310 * so clear it and rebuild just what we need for the breadcrumb. 4311 * All pending requests for this context will be zapped, and any 4312 * future request will be after userspace has had the opportunity 4313 * to recreate its own state. 4314 */ 4315 out_replay: 4316 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4317 head, ce->ring->tail); 4318 __execlists_reset_reg_state(ce, engine); 4319 __execlists_update_reg_state(ce, engine, head); 4320 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4321 4322 unwind: 4323 /* Push back any incomplete requests for replay after the reset. */ 4324 cancel_port_requests(execlists); 4325 __unwind_incomplete_requests(engine); 4326 } 4327 4328 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4329 { 4330 unsigned long flags; 4331 4332 ENGINE_TRACE(engine, "\n"); 4333 4334 spin_lock_irqsave(&engine->active.lock, flags); 4335 4336 __execlists_reset(engine, stalled); 4337 4338 spin_unlock_irqrestore(&engine->active.lock, flags); 4339 } 4340 4341 static void nop_submission_tasklet(unsigned long data) 4342 { 4343 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4344 4345 /* The driver is wedged; don't process any more events. */ 4346 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4347 } 4348 4349 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4350 { 4351 struct intel_engine_execlists * const execlists = &engine->execlists; 4352 struct i915_request *rq, *rn; 4353 struct rb_node *rb; 4354 unsigned long flags; 4355 4356 ENGINE_TRACE(engine, "\n"); 4357 4358 /* 4359 * Before we call engine->cancel_requests(), we should have exclusive 4360 * access to the submission state. This is arranged for us by the 4361 * caller disabling the interrupt generation, the tasklet and other 4362 * threads that may then access the same state, giving us a free hand 4363 * to reset state. However, we still need to let lockdep be aware that 4364 * we know this state may be accessed in hardirq context, so we 4365 * disable the irq around this manipulation and we want to keep 4366 * the spinlock focused on its duties and not accidentally conflate 4367 * coverage to the submission's irq state. (Similarly, although we 4368 * shouldn't need to disable irq around the manipulation of the 4369 * submission's irq state, we also wish to remind ourselves that 4370 * it is irq state.) 4371 */ 4372 spin_lock_irqsave(&engine->active.lock, flags); 4373 4374 __execlists_reset(engine, true); 4375 4376 /* Mark all executing requests as skipped. */ 4377 list_for_each_entry(rq, &engine->active.requests, sched.link) 4378 mark_eio(rq); 4379 4380 /* Flush the queued requests to the timeline list (for retiring). 
*/ 4381 while ((rb = rb_first_cached(&execlists->queue))) { 4382 struct i915_priolist *p = to_priolist(rb); 4383 int i; 4384 4385 priolist_for_each_request_consume(rq, rn, p, i) { 4386 mark_eio(rq); 4387 __i915_request_submit(rq); 4388 } 4389 4390 rb_erase_cached(&p->node, &execlists->queue); 4391 i915_priolist_free(p); 4392 } 4393 4394 /* On-hold requests will be flushed to timeline upon their release */ 4395 list_for_each_entry(rq, &engine->active.hold, sched.link) 4396 mark_eio(rq); 4397 4398 /* Cancel all attached virtual engines */ 4399 while ((rb = rb_first_cached(&execlists->virtual))) { 4400 struct virtual_engine *ve = 4401 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4402 4403 rb_erase_cached(rb, &execlists->virtual); 4404 RB_CLEAR_NODE(rb); 4405 4406 spin_lock(&ve->base.active.lock); 4407 rq = fetch_and_zero(&ve->request); 4408 if (rq) { 4409 mark_eio(rq); 4410 4411 rq->engine = engine; 4412 __i915_request_submit(rq); 4413 i915_request_put(rq); 4414 4415 ve->base.execlists.queue_priority_hint = INT_MIN; 4416 } 4417 spin_unlock(&ve->base.active.lock); 4418 } 4419 4420 /* Remaining _unready_ requests will be nop'ed when submitted */ 4421 4422 execlists->queue_priority_hint = INT_MIN; 4423 execlists->queue = RB_ROOT_CACHED; 4424 4425 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4426 execlists->tasklet.func = nop_submission_tasklet; 4427 4428 spin_unlock_irqrestore(&engine->active.lock, flags); 4429 } 4430 4431 static void execlists_reset_finish(struct intel_engine_cs *engine) 4432 { 4433 struct intel_engine_execlists * const execlists = &engine->execlists; 4434 4435 /* 4436 * After a GPU reset, we may have requests to replay. Do so now while 4437 * we still have the forcewake to be sure that the GPU is not allowed 4438 * to sleep before we restart and reload a context. 4439 */ 4440 GEM_BUG_ON(!reset_in_progress(execlists)); 4441 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4442 execlists->tasklet.func(execlists->tasklet.data); 4443 4444 if (__tasklet_enable(&execlists->tasklet)) 4445 /* And kick in case we missed a new request submission. */ 4446 tasklet_hi_schedule(&execlists->tasklet); 4447 ENGINE_TRACE(engine, "depth->%d\n", 4448 atomic_read(&execlists->tasklet.count)); 4449 } 4450 4451 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4452 u64 offset, u32 len, 4453 const unsigned int flags) 4454 { 4455 u32 *cs; 4456 4457 cs = intel_ring_begin(rq, 4); 4458 if (IS_ERR(cs)) 4459 return PTR_ERR(cs); 4460 4461 /* 4462 * WaDisableCtxRestoreArbitration:bdw,chv 4463 * 4464 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4465 * particular all the gen that do not need the w/a at all!), if we 4466 * took care to make sure that on every switch into this context 4467 * (both ordinary and for preemption) that arbitrartion was enabled 4468 * we would be fine. However, for gen8 there is another w/a that 4469 * requires us to not preempt inside GPGPU execution, so we keep 4470 * arbitration disabled for gen8 batches. Arbitration will be 4471 * re-enabled before we close the request 4472 * (engine->emit_fini_breadcrumb). 4473 */ 4474 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4475 4476 /* FIXME(BDW+): Address space and security selectors. */ 4477 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4478 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4479 *cs++ = lower_32_bits(offset); 4480 *cs++ = upper_32_bits(offset); 4481 4482 intel_ring_advance(rq, cs); 4483 4484 return 0; 4485 } 4486 4487 static int gen8_emit_bb_start(struct i915_request *rq, 4488 u64 offset, u32 len, 4489 const unsigned int flags) 4490 { 4491 u32 *cs; 4492 4493 cs = intel_ring_begin(rq, 6); 4494 if (IS_ERR(cs)) 4495 return PTR_ERR(cs); 4496 4497 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4498 4499 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4500 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4501 *cs++ = lower_32_bits(offset); 4502 *cs++ = upper_32_bits(offset); 4503 4504 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4505 *cs++ = MI_NOOP; 4506 4507 intel_ring_advance(rq, cs); 4508 4509 return 0; 4510 } 4511 4512 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4513 { 4514 ENGINE_WRITE(engine, RING_IMR, 4515 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4516 ENGINE_POSTING_READ(engine, RING_IMR); 4517 } 4518 4519 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4520 { 4521 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4522 } 4523 4524 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4525 { 4526 u32 cmd, *cs; 4527 4528 cs = intel_ring_begin(request, 4); 4529 if (IS_ERR(cs)) 4530 return PTR_ERR(cs); 4531 4532 cmd = MI_FLUSH_DW + 1; 4533 4534 /* We always require a command barrier so that subsequent 4535 * commands, such as breadcrumb interrupts, are strictly ordered 4536 * wrt the contents of the write cache being flushed to memory 4537 * (and thus being coherent from the CPU). 4538 */ 4539 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4540 4541 if (mode & EMIT_INVALIDATE) { 4542 cmd |= MI_INVALIDATE_TLB; 4543 if (request->engine->class == VIDEO_DECODE_CLASS) 4544 cmd |= MI_INVALIDATE_BSD; 4545 } 4546 4547 *cs++ = cmd; 4548 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4549 *cs++ = 0; /* upper addr */ 4550 *cs++ = 0; /* value */ 4551 intel_ring_advance(request, cs); 4552 4553 return 0; 4554 } 4555 4556 static int gen8_emit_flush_render(struct i915_request *request, 4557 u32 mode) 4558 { 4559 bool vf_flush_wa = false, dc_flush_wa = false; 4560 u32 *cs, flags = 0; 4561 int len; 4562 4563 flags |= PIPE_CONTROL_CS_STALL; 4564 4565 if (mode & EMIT_FLUSH) { 4566 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4567 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4568 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4569 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4570 } 4571 4572 if (mode & EMIT_INVALIDATE) { 4573 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4574 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4575 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4576 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4577 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4578 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4579 flags |= PIPE_CONTROL_QW_WRITE; 4580 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4581 4582 /* 4583 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4584 * pipe control. 
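 * Such a "NULL" pipe control is simply gen8_emit_pipe_control(cs, 0, 0): six
 * dwords with no flush bits and no write, emitted below purely to satisfy the
 * w/a. That is also where the length accounting below comes from: 6 dwords
 * for the main PIPE_CONTROL, +6 when vf_flush_wa adds the empty spacer, and
 * +12 when dc_flush_wa brackets it with a DC-flush PIPE_CONTROL before and a
 * CS-stall PIPE_CONTROL after.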
4585 */ 4586 if (IS_GEN(request->engine->i915, 9)) 4587 vf_flush_wa = true; 4588 4589 /* WaForGAMHang:kbl */ 4590 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4591 dc_flush_wa = true; 4592 } 4593 4594 len = 6; 4595 4596 if (vf_flush_wa) 4597 len += 6; 4598 4599 if (dc_flush_wa) 4600 len += 12; 4601 4602 cs = intel_ring_begin(request, len); 4603 if (IS_ERR(cs)) 4604 return PTR_ERR(cs); 4605 4606 if (vf_flush_wa) 4607 cs = gen8_emit_pipe_control(cs, 0, 0); 4608 4609 if (dc_flush_wa) 4610 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4611 0); 4612 4613 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4614 4615 if (dc_flush_wa) 4616 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4617 4618 intel_ring_advance(request, cs); 4619 4620 return 0; 4621 } 4622 4623 static int gen11_emit_flush_render(struct i915_request *request, 4624 u32 mode) 4625 { 4626 if (mode & EMIT_FLUSH) { 4627 u32 *cs; 4628 u32 flags = 0; 4629 4630 flags |= PIPE_CONTROL_CS_STALL; 4631 4632 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4633 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4634 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4635 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4636 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4637 flags |= PIPE_CONTROL_QW_WRITE; 4638 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4639 4640 cs = intel_ring_begin(request, 6); 4641 if (IS_ERR(cs)) 4642 return PTR_ERR(cs); 4643 4644 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4645 intel_ring_advance(request, cs); 4646 } 4647 4648 if (mode & EMIT_INVALIDATE) { 4649 u32 *cs; 4650 u32 flags = 0; 4651 4652 flags |= PIPE_CONTROL_CS_STALL; 4653 4654 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4655 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4656 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4657 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4658 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4659 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4660 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4661 flags |= PIPE_CONTROL_QW_WRITE; 4662 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4663 4664 cs = intel_ring_begin(request, 6); 4665 if (IS_ERR(cs)) 4666 return PTR_ERR(cs); 4667 4668 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4669 intel_ring_advance(request, cs); 4670 } 4671 4672 return 0; 4673 } 4674 4675 static u32 preparser_disable(bool state) 4676 { 4677 return MI_ARB_CHECK | 1 << 8 | state; 4678 } 4679 4680 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4681 { 4682 static const i915_reg_t vd[] = { 4683 GEN12_VD0_AUX_NV, 4684 GEN12_VD1_AUX_NV, 4685 GEN12_VD2_AUX_NV, 4686 GEN12_VD3_AUX_NV, 4687 }; 4688 4689 static const i915_reg_t ve[] = { 4690 GEN12_VE0_AUX_NV, 4691 GEN12_VE1_AUX_NV, 4692 }; 4693 4694 if (engine->class == VIDEO_DECODE_CLASS) 4695 return vd[engine->instance]; 4696 4697 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4698 return ve[engine->instance]; 4699 4700 GEM_BUG_ON("unknown aux_inv_reg\n"); 4701 4702 return INVALID_MMIO_REG; 4703 } 4704 4705 static u32 * 4706 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4707 { 4708 *cs++ = MI_LOAD_REGISTER_IMM(1); 4709 *cs++ = i915_mmio_reg_offset(inv_reg); 4710 *cs++ = AUX_INV; 4711 *cs++ = MI_NOOP; 4712 4713 return cs; 4714 } 4715 4716 static int gen12_emit_flush_render(struct i915_request *request, 4717 u32 mode) 4718 { 4719 if (mode & EMIT_FLUSH) { 4720 u32 flags = 0; 4721 u32 *cs; 4722 4723 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4724 flags |= PIPE_CONTROL_FLUSH_L3; 4725 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4726 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4727 /* Wa_1409600907:tgl */ 4728 flags |= PIPE_CONTROL_DEPTH_STALL; 4729 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4730 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4731 4732 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4733 flags |= PIPE_CONTROL_QW_WRITE; 4734 4735 flags |= PIPE_CONTROL_CS_STALL; 4736 4737 cs = intel_ring_begin(request, 6); 4738 if (IS_ERR(cs)) 4739 return PTR_ERR(cs); 4740 4741 cs = gen12_emit_pipe_control(cs, 4742 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4743 flags, LRC_PPHWSP_SCRATCH_ADDR); 4744 intel_ring_advance(request, cs); 4745 } 4746 4747 if (mode & EMIT_INVALIDATE) { 4748 u32 flags = 0; 4749 u32 *cs; 4750 4751 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4752 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4753 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4754 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4755 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4756 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4757 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4758 4759 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4760 flags |= PIPE_CONTROL_QW_WRITE; 4761 4762 flags |= PIPE_CONTROL_CS_STALL; 4763 4764 cs = intel_ring_begin(request, 8 + 4); 4765 if (IS_ERR(cs)) 4766 return PTR_ERR(cs); 4767 4768 /* 4769 * Prevent the pre-parser from skipping past the TLB 4770 * invalidate and loading a stale page for the batch 4771 * buffer / request payload. 4772 */ 4773 *cs++ = preparser_disable(true); 4774 4775 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4776 4777 /* hsdes: 1809175790 */ 4778 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4779 4780 *cs++ = preparser_disable(false); 4781 intel_ring_advance(request, cs); 4782 } 4783 4784 return 0; 4785 } 4786 4787 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4788 { 4789 intel_engine_mask_t aux_inv = 0; 4790 u32 cmd, *cs; 4791 4792 cmd = 4; 4793 if (mode & EMIT_INVALIDATE) 4794 cmd += 2; 4795 if (mode & EMIT_INVALIDATE) 4796 aux_inv = request->engine->mask & ~BIT(BCS0); 4797 if (aux_inv) 4798 cmd += 2 * hweight8(aux_inv) + 2; 4799 4800 cs = intel_ring_begin(request, cmd); 4801 if (IS_ERR(cs)) 4802 return PTR_ERR(cs); 4803 4804 if (mode & EMIT_INVALIDATE) 4805 *cs++ = preparser_disable(true); 4806 4807 cmd = MI_FLUSH_DW + 1; 4808 4809 /* We always require a command barrier so that subsequent 4810 * commands, such as breadcrumb interrupts, are strictly ordered 4811 * wrt the contents of the write cache being flushed to memory 4812 * (and thus being coherent from the CPU). 
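 *
 * For reference, the ring space reserved above breaks down as: 4 dwords for
 * the MI_FLUSH_DW itself (command, address, upper address, value), +2 for the
 * preparser disable/re-enable pair when invalidating, and, when aux-table
 * invalidation is required, a further 2 dwords per engine plus 2 for the
 * MI_LOAD_REGISTER_IMM header and the trailing MI_NOOP.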
4813 */ 4814 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4815 4816 if (mode & EMIT_INVALIDATE) { 4817 cmd |= MI_INVALIDATE_TLB; 4818 if (request->engine->class == VIDEO_DECODE_CLASS) 4819 cmd |= MI_INVALIDATE_BSD; 4820 } 4821 4822 *cs++ = cmd; 4823 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4824 *cs++ = 0; /* upper addr */ 4825 *cs++ = 0; /* value */ 4826 4827 if (aux_inv) { /* hsdes: 1809175790 */ 4828 struct intel_engine_cs *engine; 4829 unsigned int tmp; 4830 4831 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4832 for_each_engine_masked(engine, request->engine->gt, 4833 aux_inv, tmp) { 4834 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4835 *cs++ = AUX_INV; 4836 } 4837 *cs++ = MI_NOOP; 4838 } 4839 4840 if (mode & EMIT_INVALIDATE) 4841 *cs++ = preparser_disable(false); 4842 4843 intel_ring_advance(request, cs); 4844 4845 return 0; 4846 } 4847 4848 static void assert_request_valid(struct i915_request *rq) 4849 { 4850 struct intel_ring *ring __maybe_unused = rq->ring; 4851 4852 /* Can we unwind this request without appearing to go forwards? */ 4853 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4854 } 4855 4856 /* 4857 * Reserve space for 2 NOOPs at the end of each request to be 4858 * used as a workaround for not being allowed to do lite 4859 * restore with HEAD==TAIL (WaIdleLiteRestore). 4860 */ 4861 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4862 { 4863 /* Ensure there's always at least one preemption point per-request. */ 4864 *cs++ = MI_ARB_CHECK; 4865 *cs++ = MI_NOOP; 4866 request->wa_tail = intel_ring_offset(request, cs); 4867 4868 /* Check that entire request is less than half the ring */ 4869 assert_request_valid(request); 4870 4871 return cs; 4872 } 4873 4874 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4875 { 4876 *cs++ = MI_SEMAPHORE_WAIT | 4877 MI_SEMAPHORE_GLOBAL_GTT | 4878 MI_SEMAPHORE_POLL | 4879 MI_SEMAPHORE_SAD_EQ_SDD; 4880 *cs++ = 0; 4881 *cs++ = intel_hws_preempt_address(request->engine); 4882 *cs++ = 0; 4883 4884 return cs; 4885 } 4886 4887 static __always_inline u32* 4888 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4889 { 4890 *cs++ = MI_USER_INTERRUPT; 4891 4892 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4893 if (intel_engine_has_semaphores(request->engine)) 4894 cs = emit_preempt_busywait(request, cs); 4895 4896 request->tail = intel_ring_offset(request, cs); 4897 assert_ring_tail_valid(request->ring, request->tail); 4898 4899 return gen8_emit_wa_tail(request, cs); 4900 } 4901 4902 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs) 4903 { 4904 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0); 4905 } 4906 4907 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4908 { 4909 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4910 } 4911 4912 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4913 { 4914 cs = gen8_emit_pipe_control(cs, 4915 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4916 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4917 PIPE_CONTROL_DC_FLUSH_ENABLE, 4918 0); 4919 4920 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4921 cs = gen8_emit_ggtt_write_rcs(cs, 4922 request->fence.seqno, 4923 hwsp_offset(request), 4924 PIPE_CONTROL_FLUSH_ENABLE | 4925 PIPE_CONTROL_CS_STALL); 4926 4927 return gen8_emit_fini_breadcrumb_tail(request, cs); 4928 } 4929 4930 static u32 * 4931 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 
4932 { 4933 cs = gen8_emit_ggtt_write_rcs(cs, 4934 request->fence.seqno, 4935 hwsp_offset(request), 4936 PIPE_CONTROL_CS_STALL | 4937 PIPE_CONTROL_TILE_CACHE_FLUSH | 4938 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4939 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4940 PIPE_CONTROL_DC_FLUSH_ENABLE | 4941 PIPE_CONTROL_FLUSH_ENABLE); 4942 4943 return gen8_emit_fini_breadcrumb_tail(request, cs); 4944 } 4945 4946 /* 4947 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4948 * flush and will continue pre-fetching the instructions after it before the 4949 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4950 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4951 * of the next request before the memory has been flushed, we're guaranteed that 4952 * we won't access the batch itself too early. 4953 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4954 * so, if the current request is modifying an instruction in the next request on 4955 * the same intel_context, we might pre-fetch and then execute the pre-update 4956 * instruction. To avoid this, the users of self-modifying code should either 4957 * disable the parser around the code emitting the memory writes, via a new flag 4958 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4959 * the in-kernel use-cases we've opted to use a separate context, see 4960 * reloc_gpu() as an example. 4961 * All the above applies only to the instructions themselves. Non-inline data 4962 * used by the instructions is not pre-fetched. 4963 */ 4964 4965 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4966 { 4967 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4968 MI_SEMAPHORE_GLOBAL_GTT | 4969 MI_SEMAPHORE_POLL | 4970 MI_SEMAPHORE_SAD_EQ_SDD; 4971 *cs++ = 0; 4972 *cs++ = intel_hws_preempt_address(request->engine); 4973 *cs++ = 0; 4974 *cs++ = 0; 4975 *cs++ = MI_NOOP; 4976 4977 return cs; 4978 } 4979 4980 static __always_inline u32* 4981 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4982 { 4983 *cs++ = MI_USER_INTERRUPT; 4984 4985 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4986 if (intel_engine_has_semaphores(request->engine)) 4987 cs = gen12_emit_preempt_busywait(request, cs); 4988 4989 request->tail = intel_ring_offset(request, cs); 4990 assert_ring_tail_valid(request->ring, request->tail); 4991 4992 return gen8_emit_wa_tail(request, cs); 4993 } 4994 4995 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4996 { 4997 /* XXX Stalling flush before seqno write; post-sync not */ 4998 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0)); 4999 return gen12_emit_fini_breadcrumb_tail(rq, cs); 5000 } 5001 5002 static u32 * 5003 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 5004 { 5005 cs = gen12_emit_ggtt_write_rcs(cs, 5006 request->fence.seqno, 5007 hwsp_offset(request), 5008 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 5009 PIPE_CONTROL_CS_STALL | 5010 PIPE_CONTROL_TILE_CACHE_FLUSH | 5011 PIPE_CONTROL_FLUSH_L3 | 5012 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 5013 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 5014 /* Wa_1409600907:tgl */ 5015 PIPE_CONTROL_DEPTH_STALL | 5016 PIPE_CONTROL_DC_FLUSH_ENABLE | 5017 PIPE_CONTROL_FLUSH_ENABLE); 5018 5019 return gen12_emit_fini_breadcrumb_tail(request, cs); 5020 } 5021 5022 static void execlists_park(struct intel_engine_cs *engine) 5023 { 5024 cancel_timer(&engine->execlists.timer); 5025 cancel_timer(&engine->execlists.preempt); 5026 } 5027 
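/*
 * To make the pre-parser note above concrete: a user of self-modifying code
 * could bracket the instructions that patch a later request with the
 * MI_ARB_CHECK pre-parser toggle built by preparser_disable(). The sketch
 * below is illustrative only (emit_smc_writes() is a hypothetical helper, not
 * something this file provides); the in-kernel users instead emit such writes
 * from a separate context, cf. reloc_gpu().
 */
#if 0 /* illustrative sketch, not built */
static int emit_smc_writes(struct i915_request *rq,
			   u64 ggtt_addr, const u32 *payload, int count)
{
	u32 *cs;
	int i;

	/* 2 dwords for the pre-parser toggles + 4 per MI_STORE_DWORD_IMM */
	cs = intel_ring_begin(rq, 2 + 4 * count);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Stop the pre-parser fetching past this point until we are done */
	*cs++ = preparser_disable(true);

	for (i = 0; i < count; i++) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = lower_32_bits(ggtt_addr + i * sizeof(u32));
		*cs++ = upper_32_bits(ggtt_addr + i * sizeof(u32));
		*cs++ = payload[i];
	}

	/* Let the pre-parser run ahead again */
	*cs++ = preparser_disable(false);

	intel_ring_advance(rq, cs);
	return 0;
}
#endif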
5028 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 5029 { 5030 engine->submit_request = execlists_submit_request; 5031 engine->schedule = i915_schedule; 5032 engine->execlists.tasklet.func = execlists_submission_tasklet; 5033 5034 engine->reset.prepare = execlists_reset_prepare; 5035 engine->reset.rewind = execlists_reset_rewind; 5036 engine->reset.cancel = execlists_reset_cancel; 5037 engine->reset.finish = execlists_reset_finish; 5038 5039 engine->park = execlists_park; 5040 engine->unpark = NULL; 5041 5042 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5043 if (!intel_vgpu_active(engine->i915)) { 5044 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5045 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5046 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5047 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5048 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5049 } 5050 } 5051 5052 if (INTEL_GEN(engine->i915) >= 12) 5053 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5054 5055 if (intel_engine_has_preemption(engine)) 5056 engine->emit_bb_start = gen8_emit_bb_start; 5057 else 5058 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5059 } 5060 5061 static void execlists_shutdown(struct intel_engine_cs *engine) 5062 { 5063 /* Synchronise with residual timers and any softirq they raise */ 5064 del_timer_sync(&engine->execlists.timer); 5065 del_timer_sync(&engine->execlists.preempt); 5066 tasklet_kill(&engine->execlists.tasklet); 5067 } 5068 5069 static void execlists_release(struct intel_engine_cs *engine) 5070 { 5071 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5072 5073 execlists_shutdown(engine); 5074 5075 intel_engine_cleanup_common(engine); 5076 lrc_destroy_wa_ctx(engine); 5077 } 5078 5079 static void 5080 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5081 { 5082 /* Default vfuncs which can be overridden by each engine. */ 5083 5084 engine->resume = execlists_resume; 5085 5086 engine->cops = &execlists_context_ops; 5087 engine->request_alloc = execlists_request_alloc; 5088 5089 engine->emit_flush = gen8_emit_flush; 5090 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5091 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5092 if (INTEL_GEN(engine->i915) >= 12) { 5093 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5094 engine->emit_flush = gen12_emit_flush; 5095 } 5096 engine->set_default_submission = intel_execlists_set_default_submission; 5097 5098 if (INTEL_GEN(engine->i915) < 11) { 5099 engine->irq_enable = gen8_logical_ring_enable_irq; 5100 engine->irq_disable = gen8_logical_ring_disable_irq; 5101 } else { 5102 /* 5103 * TODO: On Gen11 interrupt masks need to be clear 5104 * to allow C6 entry. Keep interrupts enabled 5105 * and take the hit of generating extra interrupts 5106 * until a more refined solution exists.
5107 */ 5108 } 5109 } 5110 5111 static inline void 5112 logical_ring_default_irqs(struct intel_engine_cs *engine) 5113 { 5114 unsigned int shift = 0; 5115 5116 if (INTEL_GEN(engine->i915) < 11) { 5117 const u8 irq_shifts[] = { 5118 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5119 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5120 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5121 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5122 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5123 }; 5124 5125 shift = irq_shifts[engine->id]; 5126 } 5127 5128 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5129 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5130 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5131 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5132 } 5133 5134 static void rcs_submission_override(struct intel_engine_cs *engine) 5135 { 5136 switch (INTEL_GEN(engine->i915)) { 5137 case 12: 5138 engine->emit_flush = gen12_emit_flush_render; 5139 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5140 break; 5141 case 11: 5142 engine->emit_flush = gen11_emit_flush_render; 5143 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5144 break; 5145 default: 5146 engine->emit_flush = gen8_emit_flush_render; 5147 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5148 break; 5149 } 5150 } 5151 5152 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5153 { 5154 struct intel_engine_execlists * const execlists = &engine->execlists; 5155 struct drm_i915_private *i915 = engine->i915; 5156 struct intel_uncore *uncore = engine->uncore; 5157 u32 base = engine->mmio_base; 5158 5159 tasklet_init(&engine->execlists.tasklet, 5160 execlists_submission_tasklet, (unsigned long)engine); 5161 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5162 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5163 5164 logical_ring_default_vfuncs(engine); 5165 logical_ring_default_irqs(engine); 5166 5167 if (engine->class == RENDER_CLASS) 5168 rcs_submission_override(engine); 5169 5170 if (intel_init_workaround_bb(engine)) 5171 /* 5172 * We continue even if we fail to initialize WA batch 5173 * because we only expect rare glitches but nothing 5174 * critical to prevent us from using GPU 5175 */ 5176 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5177 5178 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5179 execlists->submit_reg = uncore->regs + 5180 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5181 execlists->ctrl_reg = uncore->regs + 5182 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5183 } else { 5184 execlists->submit_reg = uncore->regs + 5185 i915_mmio_reg_offset(RING_ELSP(base)); 5186 } 5187 5188 execlists->csb_status = 5189 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5190 5191 execlists->csb_write = 5192 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5193 5194 if (INTEL_GEN(i915) < 11) 5195 execlists->csb_size = GEN8_CSB_ENTRIES; 5196 else 5197 execlists->csb_size = GEN11_CSB_ENTRIES; 5198 5199 if (INTEL_GEN(engine->i915) >= 11) { 5200 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5201 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5202 } 5203 5204 /* Finally, take ownership and responsibility for cleanup! 
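 * From here on engine->release (execlists_release() above) undoes everything
 * set up in this function, including the wa_ctx object, the timers and the
 * tasklet, while engine->sanitize (execlists_sanitize()) scrubs stale state
 * across suspend/resume.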
*/ 5205 engine->sanitize = execlists_sanitize; 5206 engine->release = execlists_release; 5207 5208 return 0; 5209 } 5210 5211 static void init_common_reg_state(u32 * const regs, 5212 const struct intel_engine_cs *engine, 5213 const struct intel_ring *ring, 5214 bool inhibit) 5215 { 5216 u32 ctl; 5217 5218 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5219 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5220 if (inhibit) 5221 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5222 if (INTEL_GEN(engine->i915) < 11) 5223 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5224 CTX_CTRL_RS_CTX_ENABLE); 5225 regs[CTX_CONTEXT_CONTROL] = ctl; 5226 5227 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5228 regs[CTX_TIMESTAMP] = 0; 5229 } 5230 5231 static void init_wa_bb_reg_state(u32 * const regs, 5232 const struct intel_engine_cs *engine) 5233 { 5234 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5235 5236 if (wa_ctx->per_ctx.size) { 5237 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5238 5239 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5240 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5241 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5242 } 5243 5244 if (wa_ctx->indirect_ctx.size) { 5245 lrc_ring_setup_indirect_ctx(regs, engine, 5246 i915_ggtt_offset(wa_ctx->vma) + 5247 wa_ctx->indirect_ctx.offset, 5248 wa_ctx->indirect_ctx.size); 5249 } 5250 } 5251 5252 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5253 { 5254 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5255 /* 64b PPGTT (48bit canonical) 5256 * PDP0_DESCRIPTOR contains the base address to PML4 and 5257 * other PDP Descriptors are ignored. 5258 */ 5259 ASSIGN_CTX_PML4(ppgtt, regs); 5260 } else { 5261 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5262 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5263 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5264 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5265 } 5266 } 5267 5268 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5269 { 5270 if (i915_is_ggtt(vm)) 5271 return i915_vm_to_ggtt(vm)->alias; 5272 else 5273 return i915_vm_to_ppgtt(vm); 5274 } 5275 5276 static void execlists_init_reg_state(u32 *regs, 5277 const struct intel_context *ce, 5278 const struct intel_engine_cs *engine, 5279 const struct intel_ring *ring, 5280 bool inhibit) 5281 { 5282 /* 5283 * A context is actually a big batch buffer with several 5284 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5285 * values we are setting here are only for the first context restore: 5286 * on a subsequent save, the GPU will recreate this batchbuffer with new 5287 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5288 * we are not initializing here). 5289 * 5290 * Must keep consistent with virtual_update_register_offsets(). 
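 *
 * Conceptually, the state written by set_offsets() below looks like (layout
 * illustrative only, the authoritative offsets come from reg_offsets()):
 *
 *	MI_LOAD_REGISTER_IMM(N) | MI_LRI_FORCE_POSTED
 *	  <reg offset> <value>		e.g. RING_CONTEXT_CONTROL
 *	  <reg offset> <value>		e.g. RING_HEAD, RING_TAIL, RING_CTL, ...
 *	MI_LOAD_REGISTER_IMM(M)
 *	  ...
 *
 * i.e. exactly the (reg, value) pairs the CS consumes when it restores the
 * context, which is why the layout must match what the HW expects.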
5291 */ 5292 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5293 5294 init_common_reg_state(regs, engine, ring, inhibit); 5295 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5296 5297 init_wa_bb_reg_state(regs, engine); 5298 5299 __reset_stop_ring(regs, engine); 5300 } 5301 5302 static int 5303 populate_lr_context(struct intel_context *ce, 5304 struct drm_i915_gem_object *ctx_obj, 5305 struct intel_engine_cs *engine, 5306 struct intel_ring *ring) 5307 { 5308 bool inhibit = true; 5309 void *vaddr; 5310 5311 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5312 if (IS_ERR(vaddr)) { 5313 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5314 return PTR_ERR(vaddr); 5315 } 5316 5317 set_redzone(vaddr, engine); 5318 5319 if (engine->default_state) { 5320 shmem_read(engine->default_state, 0, 5321 vaddr, engine->context_size); 5322 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5323 inhibit = false; 5324 } 5325 5326 /* Clear the ppHWSP (inc. per-context counters) */ 5327 memset(vaddr, 0, PAGE_SIZE); 5328 5329 /* 5330 * The second page of the context object contains some registers which 5331 * must be set up prior to the first execution. 5332 */ 5333 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5334 ce, engine, ring, inhibit); 5335 5336 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5337 i915_gem_object_unpin_map(ctx_obj); 5338 return 0; 5339 } 5340 5341 static struct intel_timeline *pinned_timeline(struct intel_context *ce) 5342 { 5343 struct intel_timeline *tl = fetch_and_zero(&ce->timeline); 5344 5345 return intel_timeline_create_from_engine(ce->engine, 5346 page_unmask_bits(tl)); 5347 } 5348 5349 static int __execlists_context_alloc(struct intel_context *ce, 5350 struct intel_engine_cs *engine) 5351 { 5352 struct drm_i915_gem_object *ctx_obj; 5353 struct intel_ring *ring; 5354 struct i915_vma *vma; 5355 u32 context_size; 5356 int ret; 5357 5358 GEM_BUG_ON(ce->state); 5359 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5360 5361 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5362 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5363 5364 if (INTEL_GEN(engine->i915) == 12) { 5365 ce->wa_bb_page = context_size / PAGE_SIZE; 5366 context_size += PAGE_SIZE; 5367 } 5368 5369 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5370 if (IS_ERR(ctx_obj)) 5371 return PTR_ERR(ctx_obj); 5372 5373 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5374 if (IS_ERR(vma)) { 5375 ret = PTR_ERR(vma); 5376 goto error_deref_obj; 5377 } 5378 5379 if (!page_mask_bits(ce->timeline)) { 5380 struct intel_timeline *tl; 5381 5382 /* 5383 * Use the static global HWSP for the kernel context, and 5384 * a dynamically allocated cacheline for everyone else. 
5385 */ 5386 if (unlikely(ce->timeline)) 5387 tl = pinned_timeline(ce); 5388 else 5389 tl = intel_timeline_create(engine->gt); 5390 if (IS_ERR(tl)) { 5391 ret = PTR_ERR(tl); 5392 goto error_deref_obj; 5393 } 5394 5395 ce->timeline = tl; 5396 } 5397 5398 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5399 if (IS_ERR(ring)) { 5400 ret = PTR_ERR(ring); 5401 goto error_deref_obj; 5402 } 5403 5404 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5405 if (ret) { 5406 drm_dbg(&engine->i915->drm, 5407 "Failed to populate LRC: %d\n", ret); 5408 goto error_ring_free; 5409 } 5410 5411 ce->ring = ring; 5412 ce->state = vma; 5413 5414 return 0; 5415 5416 error_ring_free: 5417 intel_ring_put(ring); 5418 error_deref_obj: 5419 i915_gem_object_put(ctx_obj); 5420 return ret; 5421 } 5422 5423 static struct list_head *virtual_queue(struct virtual_engine *ve) 5424 { 5425 return &ve->base.execlists.default_priolist.requests[0]; 5426 } 5427 5428 static void virtual_context_destroy(struct kref *kref) 5429 { 5430 struct virtual_engine *ve = 5431 container_of(kref, typeof(*ve), context.ref); 5432 unsigned int n; 5433 5434 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5435 GEM_BUG_ON(ve->request); 5436 GEM_BUG_ON(ve->context.inflight); 5437 5438 for (n = 0; n < ve->num_siblings; n++) { 5439 struct intel_engine_cs *sibling = ve->siblings[n]; 5440 struct rb_node *node = &ve->nodes[sibling->id].rb; 5441 unsigned long flags; 5442 5443 if (RB_EMPTY_NODE(node)) 5444 continue; 5445 5446 spin_lock_irqsave(&sibling->active.lock, flags); 5447 5448 /* Detachment is lazily performed in the execlists tasklet */ 5449 if (!RB_EMPTY_NODE(node)) 5450 rb_erase_cached(node, &sibling->execlists.virtual); 5451 5452 spin_unlock_irqrestore(&sibling->active.lock, flags); 5453 } 5454 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5455 5456 if (ve->context.state) 5457 __execlists_context_fini(&ve->context); 5458 intel_context_fini(&ve->context); 5459 5460 intel_engine_free_request_pool(&ve->base); 5461 5462 kfree(ve->bonds); 5463 kfree(ve); 5464 } 5465 5466 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5467 { 5468 int swp; 5469 5470 /* 5471 * Pick a random sibling on starting to help spread the load around. 5472 * 5473 * New contexts are typically created with exactly the same order 5474 * of siblings, and often started in batches. Due to the way we iterate 5475 * the array of sibling when submitting requests, sibling[0] is 5476 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5477 * randomised across the system, we also help spread the load by the 5478 * first engine we inspect being different each time. 5479 * 5480 * NB This does not force us to execute on this engine, it will just 5481 * typically be the first we inspect for submission. 
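 *
 * E.g. with siblings {vcs0, vcs1, vcs2} and swp == 2, the swap below turns
 * the array into {vcs2, vcs1, vcs0}, so this particular virtual engine will
 * inspect vcs2 first, while other instances are likely to have kept a
 * different sibling up front.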
5482 */ 5483 swp = prandom_u32_max(ve->num_siblings); 5484 if (swp) 5485 swap(ve->siblings[swp], ve->siblings[0]); 5486 } 5487 5488 static int virtual_context_alloc(struct intel_context *ce) 5489 { 5490 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5491 5492 return __execlists_context_alloc(ce, ve->siblings[0]); 5493 } 5494 5495 static int virtual_context_pin(struct intel_context *ce, void *vaddr) 5496 { 5497 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5498 5499 /* Note: we must use a real engine class for setting up reg state */ 5500 return __execlists_context_pin(ce, ve->siblings[0], vaddr); 5501 } 5502 5503 static void virtual_context_enter(struct intel_context *ce) 5504 { 5505 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5506 unsigned int n; 5507 5508 for (n = 0; n < ve->num_siblings; n++) 5509 intel_engine_pm_get(ve->siblings[n]); 5510 5511 intel_timeline_enter(ce->timeline); 5512 } 5513 5514 static void virtual_context_exit(struct intel_context *ce) 5515 { 5516 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5517 unsigned int n; 5518 5519 intel_timeline_exit(ce->timeline); 5520 5521 for (n = 0; n < ve->num_siblings; n++) 5522 intel_engine_pm_put(ve->siblings[n]); 5523 } 5524 5525 static const struct intel_context_ops virtual_context_ops = { 5526 .alloc = virtual_context_alloc, 5527 5528 .pre_pin = execlists_context_pre_pin, 5529 .pin = virtual_context_pin, 5530 .unpin = execlists_context_unpin, 5531 .post_unpin = execlists_context_post_unpin, 5532 5533 .enter = virtual_context_enter, 5534 .exit = virtual_context_exit, 5535 5536 .destroy = virtual_context_destroy, 5537 }; 5538 5539 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5540 { 5541 struct i915_request *rq; 5542 intel_engine_mask_t mask; 5543 5544 rq = READ_ONCE(ve->request); 5545 if (!rq) 5546 return 0; 5547 5548 /* The rq is ready for submission; rq->execution_mask is now stable. 
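 * execution_mask is the bitmask of physical engines this request may run on;
 * an empty mask only arises from an invalid selection (for example when
 * bonding constraints have excluded every sibling) and is treated as an
 * error below.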
*/ 5549 mask = rq->execution_mask; 5550 if (unlikely(!mask)) { 5551 /* Invalid selection, submit to a random engine in error */ 5552 i915_request_set_error_once(rq, -ENODEV); 5553 mask = ve->siblings[0]->mask; 5554 } 5555 5556 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5557 rq->fence.context, rq->fence.seqno, 5558 mask, ve->base.execlists.queue_priority_hint); 5559 5560 return mask; 5561 } 5562 5563 static void virtual_submission_tasklet(unsigned long data) 5564 { 5565 struct virtual_engine * const ve = (struct virtual_engine *)data; 5566 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5567 intel_engine_mask_t mask; 5568 unsigned int n; 5569 5570 rcu_read_lock(); 5571 mask = virtual_submission_mask(ve); 5572 rcu_read_unlock(); 5573 if (unlikely(!mask)) 5574 return; 5575 5576 local_irq_disable(); 5577 for (n = 0; n < ve->num_siblings; n++) { 5578 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5579 struct ve_node * const node = &ve->nodes[sibling->id]; 5580 struct rb_node **parent, *rb; 5581 bool first; 5582 5583 if (!READ_ONCE(ve->request)) 5584 break; /* already handled by a sibling's tasklet */ 5585 5586 if (unlikely(!(mask & sibling->mask))) { 5587 if (!RB_EMPTY_NODE(&node->rb)) { 5588 spin_lock(&sibling->active.lock); 5589 rb_erase_cached(&node->rb, 5590 &sibling->execlists.virtual); 5591 RB_CLEAR_NODE(&node->rb); 5592 spin_unlock(&sibling->active.lock); 5593 } 5594 continue; 5595 } 5596 5597 spin_lock(&sibling->active.lock); 5598 5599 if (!RB_EMPTY_NODE(&node->rb)) { 5600 /* 5601 * Cheat and avoid rebalancing the tree if we can 5602 * reuse this node in situ. 5603 */ 5604 first = rb_first_cached(&sibling->execlists.virtual) == 5605 &node->rb; 5606 if (prio == node->prio || (prio > node->prio && first)) 5607 goto submit_engine; 5608 5609 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5610 } 5611 5612 rb = NULL; 5613 first = true; 5614 parent = &sibling->execlists.virtual.rb_root.rb_node; 5615 while (*parent) { 5616 struct ve_node *other; 5617 5618 rb = *parent; 5619 other = rb_entry(rb, typeof(*other), rb); 5620 if (prio > other->prio) { 5621 parent = &rb->rb_left; 5622 } else { 5623 parent = &rb->rb_right; 5624 first = false; 5625 } 5626 } 5627 5628 rb_link_node(&node->rb, rb, parent); 5629 rb_insert_color_cached(&node->rb, 5630 &sibling->execlists.virtual, 5631 first); 5632 5633 submit_engine: 5634 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5635 node->prio = prio; 5636 if (first && prio > sibling->execlists.queue_priority_hint) 5637 tasklet_hi_schedule(&sibling->execlists.tasklet); 5638 5639 spin_unlock(&sibling->active.lock); 5640 } 5641 local_irq_enable(); 5642 } 5643 5644 static void virtual_submit_request(struct i915_request *rq) 5645 { 5646 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5647 struct i915_request *old; 5648 unsigned long flags; 5649 5650 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5651 rq->fence.context, 5652 rq->fence.seqno); 5653 5654 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5655 5656 spin_lock_irqsave(&ve->base.active.lock, flags); 5657 5658 old = ve->request; 5659 if (old) { /* background completion event from preempt-to-busy */ 5660 GEM_BUG_ON(!i915_request_completed(old)); 5661 __i915_request_submit(old); 5662 i915_request_put(old); 5663 } 5664 5665 if (i915_request_completed(rq)) { 5666 __i915_request_submit(rq); 5667 5668 ve->base.execlists.queue_priority_hint = INT_MIN; 5669 ve->request = NULL; 5670 } else { 5671 ve->base.execlists.queue_priority_hint = rq_prio(rq); 
		ve->request = i915_request_get(rq);

		GEM_BUG_ON(!list_empty(virtual_queue(ve)));
		list_move_tail(&rq->sched.link, virtual_queue(ve));

		tasklet_hi_schedule(&ve->base.execlists.tasklet);
	}

	spin_unlock_irqrestore(&ve->base.active.lock, flags);
}

static struct ve_bond *
virtual_find_bond(struct virtual_engine *ve,
		  const struct intel_engine_cs *master)
{
	int i;

	for (i = 0; i < ve->num_bonds; i++) {
		if (ve->bonds[i].master == master)
			return &ve->bonds[i];
	}

	return NULL;
}

static void
virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
{
	struct virtual_engine *ve = to_virtual_engine(rq->engine);
	intel_engine_mask_t allowed, exec;
	struct ve_bond *bond;

	allowed = ~to_request(signal)->engine->mask;

	bond = virtual_find_bond(ve, to_request(signal)->engine);
	if (bond)
		allowed &= bond->sibling_mask;

	/* Restrict the bonded request to run on only the available engines */
	exec = READ_ONCE(rq->execution_mask);
	while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
		;

	/* Prevent the master from being re-run on the bonded engines */
	to_request(signal)->execution_mask &= ~allowed;
}

struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. A virtual engine encompasses more than one
	 * physical engine, so we cannot accurately tell in advance whether
	 * one of those engines is already saturated and therefore cannot
	 * afford to use a semaphore (and be pessimized in priority for
	 * doing so) -- if we are the only context using semaphores after
	 * all other clients have stopped, we will be starved on the
	 * saturated system. Such a global switch for semaphores is less
	 * than ideal, but alas is the current compromise.
	 */
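	/* Treat all siblings as saturated: never opt into semaphores. */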
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_execlists(&ve->base);

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
	if (!ve->base.breadcrumbs) {
		err = -ENOMEM;
		goto err_put;
	}

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handled cloning of the requests and
		 * submitted a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
		 */
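		/*
		 * ve->base.class starts out as OTHER_CLASS; the first sibling
		 * seeds the class and emission vfuncs below, and every
		 * subsequent sibling must match it.
		 */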
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
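	/*
	 * Dump up to 'max' requests from each bucket: requests already
	 * submitted to the HW ("E"), requests waiting in the priority
	 * queue ("Q"), and requests parked on virtual engines ("V").
	 */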
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be submitted after userspace has had the
	 * opportunity to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif