/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Ben Widawsky <ben@bwidawsk.net>
 *    Michel Thierry <michel.thierry@intel.com>
 *    Thomas Daniel <thomas.daniel@intel.com>
 *    Oscar Mateo <oscar.mateo@intel.com>
 *
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things in the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
 */
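/*
 * A minimal sketch of the pairing rule described above -- how the head of the
 * request queue is reduced to at most two distinct contexts per ELSP write.
 * This is an illustration only: queue_head()/queue_next() are made-up helpers
 * and the real logic lives in execlists_dequeue() later in this file.
 *
 *	struct i915_request *first = queue_head(queue);
 *	struct i915_request *second = NULL;
 *	struct i915_request *rq;
 *
 *	for (rq = queue_next(first); rq; rq = queue_next(rq)) {
 *		if (rq->context == first->context) {
 *			first = rq;	// same context: fold into port 0 by
 *					// advancing its RING_TAIL
 *		} else {
 *			second = rq;	// first different context fills port 1
 *			break;		// (a context may not appear twice)
 *		}
 *	}
 *	// Submit { first, second } to the ELSP; second may be NULL.
 */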
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_perf.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define CTX_DESC_FORCE_RESTORE BIT_ULL(2)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK	GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late
	 * greedy scheduling -- each real engine takes the next available
	 * request upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection of
	 * physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto.
*/ 219 unsigned int num_siblings; 220 struct intel_engine_cs *siblings[]; 221 }; 222 223 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) 224 { 225 GEM_BUG_ON(!intel_engine_is_virtual(engine)); 226 return container_of(engine, struct virtual_engine, base); 227 } 228 229 static int __execlists_context_alloc(struct intel_context *ce, 230 struct intel_engine_cs *engine); 231 232 static void execlists_init_reg_state(u32 *reg_state, 233 const struct intel_context *ce, 234 const struct intel_engine_cs *engine, 235 const struct intel_ring *ring, 236 bool close); 237 static void 238 __execlists_update_reg_state(const struct intel_context *ce, 239 const struct intel_engine_cs *engine, 240 u32 head); 241 242 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) 243 { 244 if (INTEL_GEN(engine->i915) >= 12) 245 return 0x60; 246 else if (INTEL_GEN(engine->i915) >= 9) 247 return 0x54; 248 else if (engine->class == RENDER_CLASS) 249 return 0x58; 250 else 251 return -1; 252 } 253 254 static int lrc_ring_gpr0(const struct intel_engine_cs *engine) 255 { 256 if (INTEL_GEN(engine->i915) >= 12) 257 return 0x74; 258 else if (INTEL_GEN(engine->i915) >= 9) 259 return 0x68; 260 else if (engine->class == RENDER_CLASS) 261 return 0xd8; 262 else 263 return -1; 264 } 265 266 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) 267 { 268 if (INTEL_GEN(engine->i915) >= 12) 269 return 0x12; 270 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) 271 return 0x18; 272 else 273 return -1; 274 } 275 276 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) 277 { 278 int x; 279 280 x = lrc_ring_wa_bb_per_ctx(engine); 281 if (x < 0) 282 return x; 283 284 return x + 2; 285 } 286 287 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) 288 { 289 int x; 290 291 x = lrc_ring_indirect_ptr(engine); 292 if (x < 0) 293 return x; 294 295 return x + 2; 296 } 297 298 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) 299 { 300 if (engine->class != RENDER_CLASS) 301 return -1; 302 303 if (INTEL_GEN(engine->i915) >= 12) 304 return 0xb6; 305 else if (INTEL_GEN(engine->i915) >= 11) 306 return 0xaa; 307 else 308 return -1; 309 } 310 311 static u32 312 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) 313 { 314 switch (INTEL_GEN(engine->i915)) { 315 default: 316 MISSING_CASE(INTEL_GEN(engine->i915)); 317 fallthrough; 318 case 12: 319 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 320 case 11: 321 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 322 case 10: 323 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 324 case 9: 325 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 326 case 8: 327 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; 328 } 329 } 330 331 static void 332 lrc_ring_setup_indirect_ctx(u32 *regs, 333 const struct intel_engine_cs *engine, 334 u32 ctx_bb_ggtt_addr, 335 u32 size) 336 { 337 GEM_BUG_ON(!size); 338 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); 339 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); 340 regs[lrc_ring_indirect_ptr(engine) + 1] = 341 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); 342 343 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); 344 regs[lrc_ring_indirect_offset(engine) + 1] = 345 lrc_ring_indirect_offset_default(engine) << 6; 346 } 347 348 static u32 intel_context_get_runtime(const struct intel_context *ce) 349 { 350 /* 351 * We can use either ppHWSP[16] which is recorded before the context 352 * switch (and so excludes the cost of 
context switches) or use the 353 * value from the context image itself, which is saved/restored earlier 354 * and so includes the cost of the save. 355 */ 356 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); 357 } 358 359 static void mark_eio(struct i915_request *rq) 360 { 361 if (i915_request_completed(rq)) 362 return; 363 364 GEM_BUG_ON(i915_request_signaled(rq)); 365 366 i915_request_set_error_once(rq, -EIO); 367 i915_request_mark_complete(rq); 368 } 369 370 static struct i915_request * 371 active_request(const struct intel_timeline * const tl, struct i915_request *rq) 372 { 373 struct i915_request *active = rq; 374 375 rcu_read_lock(); 376 list_for_each_entry_continue_reverse(rq, &tl->requests, link) { 377 if (i915_request_completed(rq)) 378 break; 379 380 active = rq; 381 } 382 rcu_read_unlock(); 383 384 return active; 385 } 386 387 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) 388 { 389 return (i915_ggtt_offset(engine->status_page.vma) + 390 I915_GEM_HWS_PREEMPT_ADDR); 391 } 392 393 static inline void 394 ring_set_paused(const struct intel_engine_cs *engine, int state) 395 { 396 /* 397 * We inspect HWS_PREEMPT with a semaphore inside 398 * engine->emit_fini_breadcrumb. If the dword is true, 399 * the ring is paused as the semaphore will busywait 400 * until the dword is false. 401 */ 402 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state; 403 if (state) 404 wmb(); 405 } 406 407 static inline struct i915_priolist *to_priolist(struct rb_node *rb) 408 { 409 return rb_entry(rb, struct i915_priolist, node); 410 } 411 412 static inline int rq_prio(const struct i915_request *rq) 413 { 414 return READ_ONCE(rq->sched.attr.priority); 415 } 416 417 static int effective_prio(const struct i915_request *rq) 418 { 419 int prio = rq_prio(rq); 420 421 /* 422 * If this request is special and must not be interrupted at any 423 * cost, so be it. Note we are only checking the most recent request 424 * in the context and so may be masking an earlier vip request. It 425 * is hoped that under the conditions where nopreempt is used, this 426 * will not matter (i.e. all requests to that context will be 427 * nopreempt for as long as desired). 428 */ 429 if (i915_request_has_nopreempt(rq)) 430 prio = I915_PRIORITY_UNPREEMPTABLE; 431 432 return prio; 433 } 434 435 static int queue_prio(const struct intel_engine_execlists *execlists) 436 { 437 struct i915_priolist *p; 438 struct rb_node *rb; 439 440 rb = rb_first_cached(&execlists->queue); 441 if (!rb) 442 return INT_MIN; 443 444 /* 445 * As the priolist[] are inverted, with the highest priority in [0], 446 * we have to flip the index value to become priority. 447 */ 448 p = to_priolist(rb); 449 if (!I915_USER_PRIORITY_SHIFT) 450 return p->priority; 451 452 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); 453 } 454 455 static inline bool need_preempt(const struct intel_engine_cs *engine, 456 const struct i915_request *rq, 457 struct rb_node *rb) 458 { 459 int last_prio; 460 461 if (!intel_engine_has_semaphores(engine)) 462 return false; 463 464 /* 465 * Check if the current priority hint merits a preemption attempt. 466 * 467 * We record the highest value priority we saw during rescheduling 468 * prior to this dequeue, therefore we know that if it is strictly 469 * less than the current tail of ESLP[0], we do not need to force 470 * a preempt-to-idle cycle. 471 * 472 * However, the priority hint is a mere hint that we may need to 473 * preempt. 
If that hint is stale or we may be trying to preempt 474 * ourselves, ignore the request. 475 * 476 * More naturally we would write 477 * prio >= max(0, last); 478 * except that we wish to prevent triggering preemption at the same 479 * priority level: the task that is running should remain running 480 * to preserve FIFO ordering of dependencies. 481 */ 482 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1); 483 if (engine->execlists.queue_priority_hint <= last_prio) 484 return false; 485 486 /* 487 * Check against the first request in ELSP[1], it will, thanks to the 488 * power of PI, be the highest priority of that context. 489 */ 490 if (!list_is_last(&rq->sched.link, &engine->active.requests) && 491 rq_prio(list_next_entry(rq, sched.link)) > last_prio) 492 return true; 493 494 if (rb) { 495 struct virtual_engine *ve = 496 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 497 bool preempt = false; 498 499 if (engine == ve->siblings[0]) { /* only preempt one sibling */ 500 struct i915_request *next; 501 502 rcu_read_lock(); 503 next = READ_ONCE(ve->request); 504 if (next) 505 preempt = rq_prio(next) > last_prio; 506 rcu_read_unlock(); 507 } 508 509 if (preempt) 510 return preempt; 511 } 512 513 /* 514 * If the inflight context did not trigger the preemption, then maybe 515 * it was the set of queued requests? Pick the highest priority in 516 * the queue (the first active priolist) and see if it deserves to be 517 * running instead of ELSP[0]. 518 * 519 * The highest priority request in the queue can not be either 520 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same 521 * context, it's priority would not exceed ELSP[0] aka last_prio. 522 */ 523 return queue_prio(&engine->execlists) > last_prio; 524 } 525 526 __maybe_unused static inline bool 527 assert_priority_queue(const struct i915_request *prev, 528 const struct i915_request *next) 529 { 530 /* 531 * Without preemption, the prev may refer to the still active element 532 * which we refuse to let go. 533 * 534 * Even with preemption, there are times when we think it is better not 535 * to preempt and leave an ostensibly lower priority request in flight. 536 */ 537 if (i915_request_is_active(prev)) 538 return true; 539 540 return rq_prio(prev) >= rq_prio(next); 541 } 542 543 /* 544 * The context descriptor encodes various attributes of a context, 545 * including its GTT address and some flags. Because it's fairly 546 * expensive to calculate, we'll just do it once and cache the result, 547 * which remains valid until the context is unpinned. 548 * 549 * This is what a descriptor looks like, from LSB to MSB:: 550 * 551 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) 552 * bits 12-31: LRCA, GTT address of (the HWSP of) this context 553 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) 554 * bits 53-54: mbz, reserved for use by hardware 555 * bits 55-63: group ID, currently unused and set to 0 556 * 557 * Starting from Gen11, the upper dword of the descriptor has a new format: 558 * 559 * bits 32-36: reserved 560 * bits 37-47: SW context ID 561 * bits 48:53: engine instance 562 * bit 54: mbz, reserved for use by hardware 563 * bits 55-60: SW counter 564 * bits 61-63: engine class 565 * 566 * engine info, SW context ID and SW counter need to form a unique number 567 * (Context ID) per lrc. 
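 *
 * As a purely illustrative example (values invented here, not taken from the
 * code below): on Gen11+, SW context ID 5 with SW counter 0, engine instance 0
 * and engine class 1 would pack into the upper dword as
 *
 *	(1 << (61 - 32)) | (0 << (55 - 32)) | (0 << (48 - 32)) | (5 << (37 - 32))
 *		= 0x200000a0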
568 */ 569 static u32 570 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) 571 { 572 u32 desc; 573 574 desc = INTEL_LEGACY_32B_CONTEXT; 575 if (i915_vm_is_4lvl(ce->vm)) 576 desc = INTEL_LEGACY_64B_CONTEXT; 577 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT; 578 579 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE; 580 if (IS_GEN(engine->i915, 8)) 581 desc |= GEN8_CTX_L3LLC_COHERENT; 582 583 return i915_ggtt_offset(ce->state) | desc; 584 } 585 586 static inline unsigned int dword_in_page(void *addr) 587 { 588 return offset_in_page(addr) / sizeof(u32); 589 } 590 591 static void set_offsets(u32 *regs, 592 const u8 *data, 593 const struct intel_engine_cs *engine, 594 bool clear) 595 #define NOP(x) (BIT(7) | (x)) 596 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) 597 #define POSTED BIT(0) 598 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) 599 #define REG16(x) \ 600 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ 601 (((x) >> 2) & 0x7f) 602 #define END(total_state_size) 0, (total_state_size) 603 { 604 const u32 base = engine->mmio_base; 605 606 while (*data) { 607 u8 count, flags; 608 609 if (*data & BIT(7)) { /* skip */ 610 count = *data++ & ~BIT(7); 611 if (clear) 612 memset32(regs, MI_NOOP, count); 613 regs += count; 614 continue; 615 } 616 617 count = *data & 0x3f; 618 flags = *data >> 6; 619 data++; 620 621 *regs = MI_LOAD_REGISTER_IMM(count); 622 if (flags & POSTED) 623 *regs |= MI_LRI_FORCE_POSTED; 624 if (INTEL_GEN(engine->i915) >= 11) 625 *regs |= MI_LRI_LRM_CS_MMIO; 626 regs++; 627 628 GEM_BUG_ON(!count); 629 do { 630 u32 offset = 0; 631 u8 v; 632 633 do { 634 v = *data++; 635 offset <<= 7; 636 offset |= v & ~BIT(7); 637 } while (v & BIT(7)); 638 639 regs[0] = base + (offset << 2); 640 if (clear) 641 regs[1] = 0; 642 regs += 2; 643 } while (--count); 644 } 645 646 if (clear) { 647 u8 count = *++data; 648 649 /* Clear past the tail for HW access */ 650 GEM_BUG_ON(dword_in_page(regs) > count); 651 memset32(regs, MI_NOOP, count - dword_in_page(regs)); 652 653 /* Close the batch; used mainly by live_lrc_layout() */ 654 *regs = MI_BATCH_BUFFER_END; 655 if (INTEL_GEN(engine->i915) >= 10) 656 *regs |= BIT(0); 657 } 658 } 659 660 static const u8 gen8_xcs_offsets[] = { 661 NOP(1), 662 LRI(11, 0), 663 REG16(0x244), 664 REG(0x034), 665 REG(0x030), 666 REG(0x038), 667 REG(0x03c), 668 REG(0x168), 669 REG(0x140), 670 REG(0x110), 671 REG(0x11c), 672 REG(0x114), 673 REG(0x118), 674 675 NOP(9), 676 LRI(9, 0), 677 REG16(0x3a8), 678 REG16(0x28c), 679 REG16(0x288), 680 REG16(0x284), 681 REG16(0x280), 682 REG16(0x27c), 683 REG16(0x278), 684 REG16(0x274), 685 REG16(0x270), 686 687 NOP(13), 688 LRI(2, 0), 689 REG16(0x200), 690 REG(0x028), 691 692 END(80) 693 }; 694 695 static const u8 gen9_xcs_offsets[] = { 696 NOP(1), 697 LRI(14, POSTED), 698 REG16(0x244), 699 REG(0x034), 700 REG(0x030), 701 REG(0x038), 702 REG(0x03c), 703 REG(0x168), 704 REG(0x140), 705 REG(0x110), 706 REG(0x11c), 707 REG(0x114), 708 REG(0x118), 709 REG(0x1c0), 710 REG(0x1c4), 711 REG(0x1c8), 712 713 NOP(3), 714 LRI(9, POSTED), 715 REG16(0x3a8), 716 REG16(0x28c), 717 REG16(0x288), 718 REG16(0x284), 719 REG16(0x280), 720 REG16(0x27c), 721 REG16(0x278), 722 REG16(0x274), 723 REG16(0x270), 724 725 NOP(13), 726 LRI(1, POSTED), 727 REG16(0x200), 728 729 NOP(13), 730 LRI(44, POSTED), 731 REG(0x028), 732 REG(0x09c), 733 REG(0x0c0), 734 REG(0x178), 735 REG(0x17c), 736 REG16(0x358), 737 REG(0x170), 738 REG(0x150), 739 REG(0x154), 740 REG(0x158), 741 REG16(0x41c), 742 
REG16(0x600), 743 REG16(0x604), 744 REG16(0x608), 745 REG16(0x60c), 746 REG16(0x610), 747 REG16(0x614), 748 REG16(0x618), 749 REG16(0x61c), 750 REG16(0x620), 751 REG16(0x624), 752 REG16(0x628), 753 REG16(0x62c), 754 REG16(0x630), 755 REG16(0x634), 756 REG16(0x638), 757 REG16(0x63c), 758 REG16(0x640), 759 REG16(0x644), 760 REG16(0x648), 761 REG16(0x64c), 762 REG16(0x650), 763 REG16(0x654), 764 REG16(0x658), 765 REG16(0x65c), 766 REG16(0x660), 767 REG16(0x664), 768 REG16(0x668), 769 REG16(0x66c), 770 REG16(0x670), 771 REG16(0x674), 772 REG16(0x678), 773 REG16(0x67c), 774 REG(0x068), 775 776 END(176) 777 }; 778 779 static const u8 gen12_xcs_offsets[] = { 780 NOP(1), 781 LRI(13, POSTED), 782 REG16(0x244), 783 REG(0x034), 784 REG(0x030), 785 REG(0x038), 786 REG(0x03c), 787 REG(0x168), 788 REG(0x140), 789 REG(0x110), 790 REG(0x1c0), 791 REG(0x1c4), 792 REG(0x1c8), 793 REG(0x180), 794 REG16(0x2b4), 795 796 NOP(5), 797 LRI(9, POSTED), 798 REG16(0x3a8), 799 REG16(0x28c), 800 REG16(0x288), 801 REG16(0x284), 802 REG16(0x280), 803 REG16(0x27c), 804 REG16(0x278), 805 REG16(0x274), 806 REG16(0x270), 807 808 END(80) 809 }; 810 811 static const u8 gen8_rcs_offsets[] = { 812 NOP(1), 813 LRI(14, POSTED), 814 REG16(0x244), 815 REG(0x034), 816 REG(0x030), 817 REG(0x038), 818 REG(0x03c), 819 REG(0x168), 820 REG(0x140), 821 REG(0x110), 822 REG(0x11c), 823 REG(0x114), 824 REG(0x118), 825 REG(0x1c0), 826 REG(0x1c4), 827 REG(0x1c8), 828 829 NOP(3), 830 LRI(9, POSTED), 831 REG16(0x3a8), 832 REG16(0x28c), 833 REG16(0x288), 834 REG16(0x284), 835 REG16(0x280), 836 REG16(0x27c), 837 REG16(0x278), 838 REG16(0x274), 839 REG16(0x270), 840 841 NOP(13), 842 LRI(1, 0), 843 REG(0x0c8), 844 845 END(80) 846 }; 847 848 static const u8 gen9_rcs_offsets[] = { 849 NOP(1), 850 LRI(14, POSTED), 851 REG16(0x244), 852 REG(0x34), 853 REG(0x30), 854 REG(0x38), 855 REG(0x3c), 856 REG(0x168), 857 REG(0x140), 858 REG(0x110), 859 REG(0x11c), 860 REG(0x114), 861 REG(0x118), 862 REG(0x1c0), 863 REG(0x1c4), 864 REG(0x1c8), 865 866 NOP(3), 867 LRI(9, POSTED), 868 REG16(0x3a8), 869 REG16(0x28c), 870 REG16(0x288), 871 REG16(0x284), 872 REG16(0x280), 873 REG16(0x27c), 874 REG16(0x278), 875 REG16(0x274), 876 REG16(0x270), 877 878 NOP(13), 879 LRI(1, 0), 880 REG(0xc8), 881 882 NOP(13), 883 LRI(44, POSTED), 884 REG(0x28), 885 REG(0x9c), 886 REG(0xc0), 887 REG(0x178), 888 REG(0x17c), 889 REG16(0x358), 890 REG(0x170), 891 REG(0x150), 892 REG(0x154), 893 REG(0x158), 894 REG16(0x41c), 895 REG16(0x600), 896 REG16(0x604), 897 REG16(0x608), 898 REG16(0x60c), 899 REG16(0x610), 900 REG16(0x614), 901 REG16(0x618), 902 REG16(0x61c), 903 REG16(0x620), 904 REG16(0x624), 905 REG16(0x628), 906 REG16(0x62c), 907 REG16(0x630), 908 REG16(0x634), 909 REG16(0x638), 910 REG16(0x63c), 911 REG16(0x640), 912 REG16(0x644), 913 REG16(0x648), 914 REG16(0x64c), 915 REG16(0x650), 916 REG16(0x654), 917 REG16(0x658), 918 REG16(0x65c), 919 REG16(0x660), 920 REG16(0x664), 921 REG16(0x668), 922 REG16(0x66c), 923 REG16(0x670), 924 REG16(0x674), 925 REG16(0x678), 926 REG16(0x67c), 927 REG(0x68), 928 929 END(176) 930 }; 931 932 static const u8 gen11_rcs_offsets[] = { 933 NOP(1), 934 LRI(15, POSTED), 935 REG16(0x244), 936 REG(0x034), 937 REG(0x030), 938 REG(0x038), 939 REG(0x03c), 940 REG(0x168), 941 REG(0x140), 942 REG(0x110), 943 REG(0x11c), 944 REG(0x114), 945 REG(0x118), 946 REG(0x1c0), 947 REG(0x1c4), 948 REG(0x1c8), 949 REG(0x180), 950 951 NOP(1), 952 LRI(9, POSTED), 953 REG16(0x3a8), 954 REG16(0x28c), 955 REG16(0x288), 956 REG16(0x284), 957 REG16(0x280), 958 REG16(0x27c), 959 
REG16(0x278), 960 REG16(0x274), 961 REG16(0x270), 962 963 LRI(1, POSTED), 964 REG(0x1b0), 965 966 NOP(10), 967 LRI(1, 0), 968 REG(0x0c8), 969 970 END(80) 971 }; 972 973 static const u8 gen12_rcs_offsets[] = { 974 NOP(1), 975 LRI(13, POSTED), 976 REG16(0x244), 977 REG(0x034), 978 REG(0x030), 979 REG(0x038), 980 REG(0x03c), 981 REG(0x168), 982 REG(0x140), 983 REG(0x110), 984 REG(0x1c0), 985 REG(0x1c4), 986 REG(0x1c8), 987 REG(0x180), 988 REG16(0x2b4), 989 990 NOP(5), 991 LRI(9, POSTED), 992 REG16(0x3a8), 993 REG16(0x28c), 994 REG16(0x288), 995 REG16(0x284), 996 REG16(0x280), 997 REG16(0x27c), 998 REG16(0x278), 999 REG16(0x274), 1000 REG16(0x270), 1001 1002 LRI(3, POSTED), 1003 REG(0x1b0), 1004 REG16(0x5a8), 1005 REG16(0x5ac), 1006 1007 NOP(6), 1008 LRI(1, 0), 1009 REG(0x0c8), 1010 NOP(3 + 9 + 1), 1011 1012 LRI(51, POSTED), 1013 REG16(0x588), 1014 REG16(0x588), 1015 REG16(0x588), 1016 REG16(0x588), 1017 REG16(0x588), 1018 REG16(0x588), 1019 REG(0x028), 1020 REG(0x09c), 1021 REG(0x0c0), 1022 REG(0x178), 1023 REG(0x17c), 1024 REG16(0x358), 1025 REG(0x170), 1026 REG(0x150), 1027 REG(0x154), 1028 REG(0x158), 1029 REG16(0x41c), 1030 REG16(0x600), 1031 REG16(0x604), 1032 REG16(0x608), 1033 REG16(0x60c), 1034 REG16(0x610), 1035 REG16(0x614), 1036 REG16(0x618), 1037 REG16(0x61c), 1038 REG16(0x620), 1039 REG16(0x624), 1040 REG16(0x628), 1041 REG16(0x62c), 1042 REG16(0x630), 1043 REG16(0x634), 1044 REG16(0x638), 1045 REG16(0x63c), 1046 REG16(0x640), 1047 REG16(0x644), 1048 REG16(0x648), 1049 REG16(0x64c), 1050 REG16(0x650), 1051 REG16(0x654), 1052 REG16(0x658), 1053 REG16(0x65c), 1054 REG16(0x660), 1055 REG16(0x664), 1056 REG16(0x668), 1057 REG16(0x66c), 1058 REG16(0x670), 1059 REG16(0x674), 1060 REG16(0x678), 1061 REG16(0x67c), 1062 REG(0x068), 1063 REG(0x084), 1064 NOP(1), 1065 1066 END(192) 1067 }; 1068 1069 #undef END 1070 #undef REG16 1071 #undef REG 1072 #undef LRI 1073 #undef NOP 1074 1075 static const u8 *reg_offsets(const struct intel_engine_cs *engine) 1076 { 1077 /* 1078 * The gen12+ lists only have the registers we program in the basic 1079 * default state. We rely on the context image using relative 1080 * addressing to automatic fixup the register state between the 1081 * physical engines for virtual engine. 1082 */ 1083 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 && 1084 !intel_engine_has_relative_mmio(engine)); 1085 1086 if (engine->class == RENDER_CLASS) { 1087 if (INTEL_GEN(engine->i915) >= 12) 1088 return gen12_rcs_offsets; 1089 else if (INTEL_GEN(engine->i915) >= 11) 1090 return gen11_rcs_offsets; 1091 else if (INTEL_GEN(engine->i915) >= 9) 1092 return gen9_rcs_offsets; 1093 else 1094 return gen8_rcs_offsets; 1095 } else { 1096 if (INTEL_GEN(engine->i915) >= 12) 1097 return gen12_xcs_offsets; 1098 else if (INTEL_GEN(engine->i915) >= 9) 1099 return gen9_xcs_offsets; 1100 else 1101 return gen8_xcs_offsets; 1102 } 1103 } 1104 1105 static struct i915_request * 1106 __unwind_incomplete_requests(struct intel_engine_cs *engine) 1107 { 1108 struct i915_request *rq, *rn, *active = NULL; 1109 struct list_head *pl; 1110 int prio = I915_PRIORITY_INVALID; 1111 1112 lockdep_assert_held(&engine->active.lock); 1113 1114 list_for_each_entry_safe_reverse(rq, rn, 1115 &engine->active.requests, 1116 sched.link) { 1117 if (i915_request_completed(rq)) 1118 continue; /* XXX */ 1119 1120 __i915_request_unsubmit(rq); 1121 1122 /* 1123 * Push the request back into the queue for later resubmission. 1124 * If this request is not native to this physical engine (i.e. 
1125 * it came from a virtual source), push it back onto the virtual 1126 * engine so that it can be moved across onto another physical 1127 * engine as load dictates. 1128 */ 1129 if (likely(rq->execution_mask == engine->mask)) { 1130 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); 1131 if (rq_prio(rq) != prio) { 1132 prio = rq_prio(rq); 1133 pl = i915_sched_lookup_priolist(engine, prio); 1134 } 1135 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 1136 1137 list_move(&rq->sched.link, pl); 1138 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 1139 1140 /* Check in case we rollback so far we wrap [size/2] */ 1141 if (intel_ring_direction(rq->ring, 1142 intel_ring_wrap(rq->ring, 1143 rq->tail), 1144 rq->ring->tail) > 0) 1145 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1146 1147 active = rq; 1148 } else { 1149 struct intel_engine_cs *owner = rq->context->engine; 1150 1151 /* 1152 * Decouple the virtual breadcrumb before moving it 1153 * back to the virtual engine -- we don't want the 1154 * request to complete in the background and try 1155 * and cancel the breadcrumb on the virtual engine 1156 * (instead of the old engine where it is linked)! 1157 */ 1158 if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, 1159 &rq->fence.flags)) { 1160 spin_lock_nested(&rq->lock, 1161 SINGLE_DEPTH_NESTING); 1162 i915_request_cancel_breadcrumb(rq); 1163 spin_unlock(&rq->lock); 1164 } 1165 WRITE_ONCE(rq->engine, owner); 1166 owner->submit_request(rq); 1167 active = NULL; 1168 } 1169 } 1170 1171 return active; 1172 } 1173 1174 struct i915_request * 1175 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) 1176 { 1177 struct intel_engine_cs *engine = 1178 container_of(execlists, typeof(*engine), execlists); 1179 1180 return __unwind_incomplete_requests(engine); 1181 } 1182 1183 static inline void 1184 execlists_context_status_change(struct i915_request *rq, unsigned long status) 1185 { 1186 /* 1187 * Only used when GVT-g is enabled now. When GVT-g is disabled, 1188 * The compiler should eliminate this function as dead-code. 
1189 */ 1190 if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) 1191 return; 1192 1193 atomic_notifier_call_chain(&rq->engine->context_status_notifier, 1194 status, rq); 1195 } 1196 1197 static void intel_engine_context_in(struct intel_engine_cs *engine) 1198 { 1199 unsigned long flags; 1200 1201 if (atomic_add_unless(&engine->stats.active, 1, 0)) 1202 return; 1203 1204 write_seqlock_irqsave(&engine->stats.lock, flags); 1205 if (!atomic_add_unless(&engine->stats.active, 1, 0)) { 1206 engine->stats.start = ktime_get(); 1207 atomic_inc(&engine->stats.active); 1208 } 1209 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1210 } 1211 1212 static void intel_engine_context_out(struct intel_engine_cs *engine) 1213 { 1214 unsigned long flags; 1215 1216 GEM_BUG_ON(!atomic_read(&engine->stats.active)); 1217 1218 if (atomic_add_unless(&engine->stats.active, -1, 1)) 1219 return; 1220 1221 write_seqlock_irqsave(&engine->stats.lock, flags); 1222 if (atomic_dec_and_test(&engine->stats.active)) { 1223 engine->stats.total = 1224 ktime_add(engine->stats.total, 1225 ktime_sub(ktime_get(), engine->stats.start)); 1226 } 1227 write_sequnlock_irqrestore(&engine->stats.lock, flags); 1228 } 1229 1230 static void 1231 execlists_check_context(const struct intel_context *ce, 1232 const struct intel_engine_cs *engine) 1233 { 1234 const struct intel_ring *ring = ce->ring; 1235 u32 *regs = ce->lrc_reg_state; 1236 bool valid = true; 1237 int x; 1238 1239 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) { 1240 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n", 1241 engine->name, 1242 regs[CTX_RING_START], 1243 i915_ggtt_offset(ring->vma)); 1244 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 1245 valid = false; 1246 } 1247 1248 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) != 1249 (RING_CTL_SIZE(ring->size) | RING_VALID)) { 1250 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n", 1251 engine->name, 1252 regs[CTX_RING_CTL], 1253 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID)); 1254 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 1255 valid = false; 1256 } 1257 1258 x = lrc_ring_mi_mode(engine); 1259 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) { 1260 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n", 1261 engine->name, regs[x + 1]); 1262 regs[x + 1] &= ~STOP_RING; 1263 regs[x + 1] |= STOP_RING << 16; 1264 valid = false; 1265 } 1266 1267 WARN_ONCE(!valid, "Invalid lrc state found before submission\n"); 1268 } 1269 1270 static void restore_default_state(struct intel_context *ce, 1271 struct intel_engine_cs *engine) 1272 { 1273 u32 *regs; 1274 1275 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); 1276 execlists_init_reg_state(regs, ce, engine, ce->ring, true); 1277 1278 ce->runtime.last = intel_context_get_runtime(ce); 1279 } 1280 1281 static void reset_active(struct i915_request *rq, 1282 struct intel_engine_cs *engine) 1283 { 1284 struct intel_context * const ce = rq->context; 1285 u32 head; 1286 1287 /* 1288 * The executing context has been cancelled. We want to prevent 1289 * further execution along this context and propagate the error on 1290 * to anything depending on its results. 1291 * 1292 * In __i915_request_submit(), we apply the -EIO and remove the 1293 * requests' payloads for any banned requests. But first, we must 1294 * rewind the context back to the start of the incomplete request so 1295 * that we do not jump back into the middle of the batch. 
1296 * 1297 * We preserve the breadcrumbs and semaphores of the incomplete 1298 * requests so that inter-timeline dependencies (i.e other timelines) 1299 * remain correctly ordered. And we defer to __i915_request_submit() 1300 * so that all asynchronous waits are correctly handled. 1301 */ 1302 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n", 1303 rq->fence.context, rq->fence.seqno); 1304 1305 /* On resubmission of the active request, payload will be scrubbed */ 1306 if (i915_request_completed(rq)) 1307 head = rq->tail; 1308 else 1309 head = active_request(ce->timeline, rq)->head; 1310 head = intel_ring_wrap(ce->ring, head); 1311 1312 /* Scrub the context image to prevent replaying the previous batch */ 1313 restore_default_state(ce, engine); 1314 __execlists_update_reg_state(ce, engine, head); 1315 1316 /* We've switched away, so this should be a no-op, but intent matters */ 1317 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 1318 } 1319 1320 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) 1321 { 1322 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) 1323 ce->runtime.num_underflow += dt < 0; 1324 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt); 1325 #endif 1326 } 1327 1328 static void intel_context_update_runtime(struct intel_context *ce) 1329 { 1330 u32 old; 1331 s32 dt; 1332 1333 if (intel_context_is_barrier(ce)) 1334 return; 1335 1336 old = ce->runtime.last; 1337 ce->runtime.last = intel_context_get_runtime(ce); 1338 dt = ce->runtime.last - old; 1339 1340 if (unlikely(dt <= 0)) { 1341 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n", 1342 old, ce->runtime.last, dt); 1343 st_update_runtime_underflow(ce, dt); 1344 return; 1345 } 1346 1347 ewma_runtime_add(&ce->runtime.avg, dt); 1348 ce->runtime.total += dt; 1349 } 1350 1351 static inline struct intel_engine_cs * 1352 __execlists_schedule_in(struct i915_request *rq) 1353 { 1354 struct intel_engine_cs * const engine = rq->engine; 1355 struct intel_context * const ce = rq->context; 1356 1357 intel_context_get(ce); 1358 1359 if (unlikely(intel_context_is_banned(ce))) 1360 reset_active(rq, engine); 1361 1362 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 1363 execlists_check_context(ce, engine); 1364 1365 if (ce->tag) { 1366 /* Use a fixed tag for OA and friends */ 1367 GEM_BUG_ON(ce->tag <= BITS_PER_LONG); 1368 ce->lrc.ccid = ce->tag; 1369 } else { 1370 /* We don't need a strict matching tag, just different values */ 1371 unsigned int tag = ffs(READ_ONCE(engine->context_tag)); 1372 1373 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); 1374 clear_bit(tag - 1, &engine->context_tag); 1375 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32); 1376 1377 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID); 1378 } 1379 1380 ce->lrc.ccid |= engine->execlists.ccid; 1381 1382 __intel_gt_pm_get(engine->gt); 1383 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active)) 1384 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain); 1385 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); 1386 intel_engine_context_in(engine); 1387 1388 return engine; 1389 } 1390 1391 static inline struct i915_request * 1392 execlists_schedule_in(struct i915_request *rq, int idx) 1393 { 1394 struct intel_context * const ce = rq->context; 1395 struct intel_engine_cs *old; 1396 1397 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine)); 1398 trace_i915_request_in(rq, idx); 1399 1400 old = READ_ONCE(ce->inflight); 1401 do { 1402 if (!old) { 1403 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq)); 1404 break; 1405 } 1406 
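		/*
		 * Otherwise the context is already in flight on this engine:
		 * ce->inflight packs a small reference count into the low
		 * bits of the engine pointer (ptr_inc() below takes a
		 * reference, ptr_dec() in execlists_schedule_out() drops
		 * it), so we only need to bump that count.
		 */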
} while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old))); 1407 1408 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); 1409 return i915_request_get(rq); 1410 } 1411 1412 static void kick_siblings(struct i915_request *rq, struct intel_context *ce) 1413 { 1414 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 1415 struct i915_request *next = READ_ONCE(ve->request); 1416 1417 if (next == rq || (next && next->execution_mask & ~rq->execution_mask)) 1418 tasklet_hi_schedule(&ve->base.execlists.tasklet); 1419 } 1420 1421 static inline void 1422 __execlists_schedule_out(struct i915_request *rq, 1423 struct intel_engine_cs * const engine, 1424 unsigned int ccid) 1425 { 1426 struct intel_context * const ce = rq->context; 1427 1428 /* 1429 * NB process_csb() is not under the engine->active.lock and hence 1430 * schedule_out can race with schedule_in meaning that we should 1431 * refrain from doing non-trivial work here. 1432 */ 1433 1434 /* 1435 * If we have just completed this context, the engine may now be 1436 * idle and we want to re-enter powersaving. 1437 */ 1438 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) && 1439 i915_request_completed(rq)) 1440 intel_engine_add_retire(engine, ce->timeline); 1441 1442 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32; 1443 ccid &= GEN12_MAX_CONTEXT_HW_ID; 1444 if (ccid < BITS_PER_LONG) { 1445 GEM_BUG_ON(ccid == 0); 1446 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag)); 1447 set_bit(ccid - 1, &engine->context_tag); 1448 } 1449 1450 intel_context_update_runtime(ce); 1451 intel_engine_context_out(engine); 1452 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); 1453 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active)) 1454 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain); 1455 intel_gt_pm_put_async(engine->gt); 1456 1457 /* 1458 * If this is part of a virtual engine, its next request may 1459 * have been blocked waiting for access to the active context. 1460 * We have to kick all the siblings again in case we need to 1461 * switch (e.g. the next request is not runnable on this 1462 * engine). Hopefully, we will already have submitted the next 1463 * request before the tasklet runs and do not need to rebuild 1464 * each virtual tree and kick everyone again. 1465 */ 1466 if (ce->engine != engine) 1467 kick_siblings(rq, ce); 1468 1469 intel_context_put(ce); 1470 } 1471 1472 static inline void 1473 execlists_schedule_out(struct i915_request *rq) 1474 { 1475 struct intel_context * const ce = rq->context; 1476 struct intel_engine_cs *cur, *old; 1477 u32 ccid; 1478 1479 trace_i915_request_out(rq); 1480 1481 ccid = rq->context->lrc.ccid; 1482 old = READ_ONCE(ce->inflight); 1483 do 1484 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL; 1485 while (!try_cmpxchg(&ce->inflight, &old, cur)); 1486 if (!cur) 1487 __execlists_schedule_out(rq, old, ccid); 1488 1489 i915_request_put(rq); 1490 } 1491 1492 static u64 execlists_update_context(struct i915_request *rq) 1493 { 1494 struct intel_context *ce = rq->context; 1495 u64 desc = ce->lrc.desc; 1496 u32 tail, prev; 1497 1498 /* 1499 * WaIdleLiteRestore:bdw,skl 1500 * 1501 * We should never submit the context with the same RING_TAIL twice 1502 * just in case we submit an empty ring, which confuses the HW. 1503 * 1504 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of 1505 * the normal request to be able to always advance the RING_TAIL on 1506 * subsequent resubmissions (for lite restore). 
Should that fail us, 1507 * and we try and submit the same tail again, force the context 1508 * reload. 1509 * 1510 * If we need to return to a preempted context, we need to skip the 1511 * lite-restore and force it to reload the RING_TAIL. Otherwise, the 1512 * HW has a tendency to ignore us rewinding the TAIL to the end of 1513 * an earlier request. 1514 */ 1515 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); 1516 prev = rq->ring->tail; 1517 tail = intel_ring_set_tail(rq->ring, rq->tail); 1518 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) 1519 desc |= CTX_DESC_FORCE_RESTORE; 1520 ce->lrc_reg_state[CTX_RING_TAIL] = tail; 1521 rq->tail = rq->wa_tail; 1522 1523 /* 1524 * Make sure the context image is complete before we submit it to HW. 1525 * 1526 * Ostensibly, writes (including the WCB) should be flushed prior to 1527 * an uncached write such as our mmio register access, the empirical 1528 * evidence (esp. on Braswell) suggests that the WC write into memory 1529 * may not be visible to the HW prior to the completion of the UC 1530 * register write and that we may begin execution from the context 1531 * before its image is complete leading to invalid PD chasing. 1532 */ 1533 wmb(); 1534 1535 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE; 1536 return desc; 1537 } 1538 1539 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) 1540 { 1541 if (execlists->ctrl_reg) { 1542 writel(lower_32_bits(desc), execlists->submit_reg + port * 2); 1543 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); 1544 } else { 1545 writel(upper_32_bits(desc), execlists->submit_reg); 1546 writel(lower_32_bits(desc), execlists->submit_reg); 1547 } 1548 } 1549 1550 static __maybe_unused char * 1551 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) 1552 { 1553 if (!rq) 1554 return ""; 1555 1556 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", 1557 prefix, 1558 rq->context->lrc.ccid, 1559 rq->fence.context, rq->fence.seqno, 1560 i915_request_completed(rq) ? "!" : 1561 i915_request_started(rq) ? 
"*" : 1562 "", 1563 rq_prio(rq)); 1564 1565 return buf; 1566 } 1567 1568 static __maybe_unused void 1569 trace_ports(const struct intel_engine_execlists *execlists, 1570 const char *msg, 1571 struct i915_request * const *ports) 1572 { 1573 const struct intel_engine_cs *engine = 1574 container_of(execlists, typeof(*engine), execlists); 1575 char __maybe_unused p0[40], p1[40]; 1576 1577 if (!ports[0]) 1578 return; 1579 1580 ENGINE_TRACE(engine, "%s { %s%s }\n", msg, 1581 dump_port(p0, sizeof(p0), "", ports[0]), 1582 dump_port(p1, sizeof(p1), ", ", ports[1])); 1583 } 1584 1585 static inline bool 1586 reset_in_progress(const struct intel_engine_execlists *execlists) 1587 { 1588 return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); 1589 } 1590 1591 static __maybe_unused bool 1592 assert_pending_valid(const struct intel_engine_execlists *execlists, 1593 const char *msg) 1594 { 1595 struct intel_engine_cs *engine = 1596 container_of(execlists, typeof(*engine), execlists); 1597 struct i915_request * const *port, *rq; 1598 struct intel_context *ce = NULL; 1599 bool sentinel = false; 1600 u32 ccid = -1; 1601 1602 trace_ports(execlists, msg, execlists->pending); 1603 1604 /* We may be messing around with the lists during reset, lalala */ 1605 if (reset_in_progress(execlists)) 1606 return true; 1607 1608 if (!execlists->pending[0]) { 1609 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", 1610 engine->name); 1611 return false; 1612 } 1613 1614 if (execlists->pending[execlists_num_ports(execlists)]) { 1615 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", 1616 engine->name, execlists_num_ports(execlists)); 1617 return false; 1618 } 1619 1620 for (port = execlists->pending; (rq = *port); port++) { 1621 unsigned long flags; 1622 bool ok = true; 1623 1624 GEM_BUG_ON(!kref_read(&rq->fence.refcount)); 1625 GEM_BUG_ON(!i915_request_is_active(rq)); 1626 1627 if (ce == rq->context) { 1628 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", 1629 engine->name, 1630 ce->timeline->fence_context, 1631 port - execlists->pending); 1632 return false; 1633 } 1634 ce = rq->context; 1635 1636 if (ccid == ce->lrc.ccid) { 1637 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", 1638 engine->name, 1639 ccid, ce->timeline->fence_context, 1640 port - execlists->pending); 1641 return false; 1642 } 1643 ccid = ce->lrc.ccid; 1644 1645 /* 1646 * Sentinels are supposed to be the last request so they flush 1647 * the current execution off the HW. Check that they are the only 1648 * request in the pending submission. 1649 */ 1650 if (sentinel) { 1651 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", 1652 engine->name, 1653 ce->timeline->fence_context, 1654 port - execlists->pending); 1655 return false; 1656 } 1657 sentinel = i915_request_has_sentinel(rq); 1658 1659 /* Hold tightly onto the lock to prevent concurrent retires! 
*/ 1660 if (!spin_trylock_irqsave(&rq->lock, flags)) 1661 continue; 1662 1663 if (i915_request_completed(rq)) 1664 goto unlock; 1665 1666 if (i915_active_is_idle(&ce->active) && 1667 !intel_context_is_barrier(ce)) { 1668 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", 1669 engine->name, 1670 ce->timeline->fence_context, 1671 port - execlists->pending); 1672 ok = false; 1673 goto unlock; 1674 } 1675 1676 if (!i915_vma_is_pinned(ce->state)) { 1677 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", 1678 engine->name, 1679 ce->timeline->fence_context, 1680 port - execlists->pending); 1681 ok = false; 1682 goto unlock; 1683 } 1684 1685 if (!i915_vma_is_pinned(ce->ring->vma)) { 1686 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", 1687 engine->name, 1688 ce->timeline->fence_context, 1689 port - execlists->pending); 1690 ok = false; 1691 goto unlock; 1692 } 1693 1694 unlock: 1695 spin_unlock_irqrestore(&rq->lock, flags); 1696 if (!ok) 1697 return false; 1698 } 1699 1700 return ce; 1701 } 1702 1703 static void execlists_submit_ports(struct intel_engine_cs *engine) 1704 { 1705 struct intel_engine_execlists *execlists = &engine->execlists; 1706 unsigned int n; 1707 1708 GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); 1709 1710 /* 1711 * We can skip acquiring intel_runtime_pm_get() here as it was taken 1712 * on our behalf by the request (see i915_gem_mark_busy()) and it will 1713 * not be relinquished until the device is idle (see 1714 * i915_gem_idle_work_handler()). As a precaution, we make sure 1715 * that all ELSP are drained i.e. we have processed the CSB, 1716 * before allowing ourselves to idle and calling intel_runtime_pm_put(). 1717 */ 1718 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 1719 1720 /* 1721 * ELSQ note: the submit queue is not cleared after being submitted 1722 * to the HW so we need to make sure we always clean it up. This is 1723 * currently ensured by the fact that we always write the same number 1724 * of elsq entries, keep this in mind before changing the loop below. 1725 */ 1726 for (n = execlists_num_ports(execlists); n--; ) { 1727 struct i915_request *rq = execlists->pending[n]; 1728 1729 write_desc(execlists, 1730 rq ? execlists_update_context(rq) : 0, 1731 n); 1732 } 1733 1734 /* we need to manually load the submit queue */ 1735 if (execlists->ctrl_reg) 1736 writel(EL_CTRL_LOAD, execlists->ctrl_reg); 1737 } 1738 1739 static bool ctx_single_port_submission(const struct intel_context *ce) 1740 { 1741 return (IS_ENABLED(CONFIG_DRM_I915_GVT) && 1742 intel_context_force_single_submission(ce)); 1743 } 1744 1745 static bool can_merge_ctx(const struct intel_context *prev, 1746 const struct intel_context *next) 1747 { 1748 if (prev != next) 1749 return false; 1750 1751 if (ctx_single_port_submission(prev)) 1752 return false; 1753 1754 return true; 1755 } 1756 1757 static unsigned long i915_request_flags(const struct i915_request *rq) 1758 { 1759 return READ_ONCE(rq->fence.flags); 1760 } 1761 1762 static bool can_merge_rq(const struct i915_request *prev, 1763 const struct i915_request *next) 1764 { 1765 GEM_BUG_ON(prev == next); 1766 GEM_BUG_ON(!assert_priority_queue(prev, next)); 1767 1768 /* 1769 * We do not submit known completed requests. Therefore if the next 1770 * request is already completed, we can pretend to merge it in 1771 * with the previous context (and we will skip updating the ELSP 1772 * and tracking). 
	 * Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (i915_request_completed(next))
		return true;

	if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static void virtual_update_register_offsets(u32 *regs,
					    struct intel_engine_cs *engine)
{
	set_offsets(regs, reg_offsets(engine), engine, false);
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

static void virtual_xfer_breadcrumbs(struct virtual_engine *ve)
{
	/*
	 * All the outstanding signals on ve->siblings[0] must have
	 * been completed, just pending the interrupt handler. As those
	 * signals still refer to the old sibling (via rq->engine), we must
	 * transfer those to the old irq_worker to keep our locking
	 * consistent.
	 */
	intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context);
}

#define for_each_waiter(p__, rq__) \
	list_for_each_entry_lockless(p__, \
				     &(rq__)->sched.waiters_list, \
				     wait_link)

#define for_each_signaler(p__, rq__) \
	list_for_each_entry_rcu(p__, \
				&(rq__)->sched.signalers_list, \
				signal_link)

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
1854 */ 1855 do { 1856 struct i915_dependency *p; 1857 1858 GEM_BUG_ON(i915_request_is_active(rq)); 1859 list_move_tail(&rq->sched.link, pl); 1860 1861 for_each_waiter(p, rq) { 1862 struct i915_request *w = 1863 container_of(p->waiter, typeof(*w), sched); 1864 1865 if (p->flags & I915_DEPENDENCY_WEAK) 1866 continue; 1867 1868 /* Leave semaphores spinning on the other engines */ 1869 if (w->engine != rq->engine) 1870 continue; 1871 1872 /* No waiter should start before its signaler */ 1873 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && 1874 i915_request_started(w) && 1875 !i915_request_completed(rq)); 1876 1877 GEM_BUG_ON(i915_request_is_active(w)); 1878 if (!i915_request_is_ready(w)) 1879 continue; 1880 1881 if (rq_prio(w) < rq_prio(rq)) 1882 continue; 1883 1884 GEM_BUG_ON(rq_prio(w) > rq_prio(rq)); 1885 list_move_tail(&w->sched.link, &list); 1886 } 1887 1888 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 1889 } while (rq); 1890 } 1891 1892 static void defer_active(struct intel_engine_cs *engine) 1893 { 1894 struct i915_request *rq; 1895 1896 rq = __unwind_incomplete_requests(engine); 1897 if (!rq) 1898 return; 1899 1900 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq))); 1901 } 1902 1903 static bool 1904 need_timeslice(const struct intel_engine_cs *engine, 1905 const struct i915_request *rq, 1906 const struct rb_node *rb) 1907 { 1908 int hint; 1909 1910 if (!intel_engine_has_timeslices(engine)) 1911 return false; 1912 1913 hint = engine->execlists.queue_priority_hint; 1914 1915 if (rb) { 1916 const struct virtual_engine *ve = 1917 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 1918 const struct intel_engine_cs *inflight = 1919 intel_context_inflight(&ve->context); 1920 1921 if (!inflight || inflight == engine) { 1922 struct i915_request *next; 1923 1924 rcu_read_lock(); 1925 next = READ_ONCE(ve->request); 1926 if (next) 1927 hint = max(hint, rq_prio(next)); 1928 rcu_read_unlock(); 1929 } 1930 } 1931 1932 if (!list_is_last(&rq->sched.link, &engine->active.requests)) 1933 hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); 1934 1935 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); 1936 return hint >= effective_prio(rq); 1937 } 1938 1939 static bool 1940 timeslice_yield(const struct intel_engine_execlists *el, 1941 const struct i915_request *rq) 1942 { 1943 /* 1944 * Once bitten, forever smitten! 1945 * 1946 * If the active context ever busy-waited on a semaphore, 1947 * it will be treated as a hog until the end of its timeslice (i.e. 1948 * until it is scheduled out and replaced by a new submission, 1949 * possibly even its own lite-restore). The HW only sends an interrupt 1950 * on the first miss, and we do know if that semaphore has been 1951 * signaled, or even if it is now stuck on another semaphore. Play 1952 * safe, yield if it might be stuck -- it will be given a fresh 1953 * timeslice in the near future. 
1954 */ 1955 return rq->context->lrc.ccid == READ_ONCE(el->yield); 1956 } 1957 1958 static bool 1959 timeslice_expired(const struct intel_engine_execlists *el, 1960 const struct i915_request *rq) 1961 { 1962 return timer_expired(&el->timer) || timeslice_yield(el, rq); 1963 } 1964 1965 static int 1966 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq) 1967 { 1968 if (list_is_last(&rq->sched.link, &engine->active.requests)) 1969 return engine->execlists.queue_priority_hint; 1970 1971 return rq_prio(list_next_entry(rq, sched.link)); 1972 } 1973 1974 static inline unsigned long 1975 timeslice(const struct intel_engine_cs *engine) 1976 { 1977 return READ_ONCE(engine->props.timeslice_duration_ms); 1978 } 1979 1980 static unsigned long active_timeslice(const struct intel_engine_cs *engine) 1981 { 1982 const struct intel_engine_execlists *execlists = &engine->execlists; 1983 const struct i915_request *rq = *execlists->active; 1984 1985 if (!rq || i915_request_completed(rq)) 1986 return 0; 1987 1988 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq)) 1989 return 0; 1990 1991 return timeslice(engine); 1992 } 1993 1994 static void set_timeslice(struct intel_engine_cs *engine) 1995 { 1996 unsigned long duration; 1997 1998 if (!intel_engine_has_timeslices(engine)) 1999 return; 2000 2001 duration = active_timeslice(engine); 2002 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); 2003 2004 set_timer_ms(&engine->execlists.timer, duration); 2005 } 2006 2007 static void start_timeslice(struct intel_engine_cs *engine, int prio) 2008 { 2009 struct intel_engine_execlists *execlists = &engine->execlists; 2010 unsigned long duration; 2011 2012 if (!intel_engine_has_timeslices(engine)) 2013 return; 2014 2015 WRITE_ONCE(execlists->switch_priority_hint, prio); 2016 if (prio == INT_MIN) 2017 return; 2018 2019 if (timer_pending(&execlists->timer)) 2020 return; 2021 2022 duration = timeslice(engine); 2023 ENGINE_TRACE(engine, 2024 "start timeslicing, prio:%d, interval:%lu", 2025 prio, duration); 2026 2027 set_timer_ms(&execlists->timer, duration); 2028 } 2029 2030 static void record_preemption(struct intel_engine_execlists *execlists) 2031 { 2032 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); 2033 } 2034 2035 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine, 2036 const struct i915_request *rq) 2037 { 2038 if (!rq) 2039 return 0; 2040 2041 /* Force a fast reset for terminated contexts (ignoring sysfs!) */ 2042 if (unlikely(intel_context_is_banned(rq->context))) 2043 return 1; 2044 2045 return READ_ONCE(engine->props.preempt_timeout_ms); 2046 } 2047 2048 static void set_preempt_timeout(struct intel_engine_cs *engine, 2049 const struct i915_request *rq) 2050 { 2051 if (!intel_engine_has_preempt_reset(engine)) 2052 return; 2053 2054 set_timer_ms(&engine->execlists.preempt, 2055 active_preempt_timeout(engine, rq)); 2056 } 2057 2058 static inline void clear_ports(struct i915_request **ports, int count) 2059 { 2060 memset_p((void **)ports, NULL, count); 2061 } 2062 2063 static void execlists_dequeue(struct intel_engine_cs *engine) 2064 { 2065 struct intel_engine_execlists * const execlists = &engine->execlists; 2066 struct i915_request **port = execlists->pending; 2067 struct i915_request ** const last_port = port + execlists->port_mask; 2068 struct i915_request * const *active; 2069 struct i915_request *last; 2070 struct rb_node *rb; 2071 bool submit = false; 2072 2073 /* 2074 * Hardware submission is through 2 ports. 
Conceptually each port 2075 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is 2076 * static for a context, and unique to each, so we only execute 2077 * requests belonging to a single context from each ring. RING_HEAD 2078 * is maintained by the CS in the context image, it marks the place 2079 * where it got up to last time, and through RING_TAIL we tell the CS 2080 * where we want to execute up to this time. 2081 * 2082 * In this list the requests are in order of execution. Consecutive 2083 * requests from the same context are adjacent in the ringbuffer. We 2084 * can combine these requests into a single RING_TAIL update: 2085 * 2086 * RING_HEAD...req1...req2 2087 * ^- RING_TAIL 2088 * since to execute req2 the CS must first execute req1. 2089 * 2090 * Our goal then is to point each port to the end of a consecutive 2091 * sequence of requests as being the most optimal (fewest wake ups 2092 * and context switches) submission. 2093 */ 2094 2095 for (rb = rb_first_cached(&execlists->virtual); rb; ) { 2096 struct virtual_engine *ve = 2097 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2098 struct i915_request *rq = READ_ONCE(ve->request); 2099 2100 if (!rq) { /* lazily cleanup after another engine handled rq */ 2101 rb_erase_cached(rb, &execlists->virtual); 2102 RB_CLEAR_NODE(rb); 2103 rb = rb_first_cached(&execlists->virtual); 2104 continue; 2105 } 2106 2107 if (!virtual_matches(ve, rq, engine)) { 2108 rb = rb_next(rb); 2109 continue; 2110 } 2111 2112 break; 2113 } 2114 2115 /* 2116 * If the queue is higher priority than the last 2117 * request in the currently active context, submit afresh. 2118 * We will resubmit again afterwards in case we need to split 2119 * the active context to interject the preemption request, 2120 * i.e. we will retrigger preemption following the ack in case 2121 * of trouble. 2122 */ 2123 active = READ_ONCE(execlists->active); 2124 2125 /* 2126 * In theory we can skip over completed contexts that have not 2127 * yet been processed by events (as those events are in flight): 2128 * 2129 * while ((last = *active) && i915_request_completed(last)) 2130 * active++; 2131 * 2132 * However, the GPU cannot handle this as it will ultimately 2133 * find itself trying to jump back into a context it has just 2134 * completed and barf. 2135 */ 2136 2137 if ((last = *active)) { 2138 if (need_preempt(engine, last, rb)) { 2139 if (i915_request_completed(last)) { 2140 tasklet_hi_schedule(&execlists->tasklet); 2141 return; 2142 } 2143 2144 ENGINE_TRACE(engine, 2145 "preempting last=%llx:%lld, prio=%d, hint=%d\n", 2146 last->fence.context, 2147 last->fence.seqno, 2148 last->sched.attr.priority, 2149 execlists->queue_priority_hint); 2150 record_preemption(execlists); 2151 2152 /* 2153 * Don't let the RING_HEAD advance past the breadcrumb 2154 * as we unwind (and until we resubmit) so that we do 2155 * not accidentally tell it to go backwards. 2156 */ 2157 ring_set_paused(engine, 1); 2158 2159 /* 2160 * Note that we have not stopped the GPU at this point, 2161 * so we are unwinding the incomplete requests as they 2162 * remain inflight and so by the time we do complete 2163 * the preemption, some of the unwound requests may 2164 * complete! 
2165 */ 2166 __unwind_incomplete_requests(engine); 2167 2168 last = NULL; 2169 } else if (need_timeslice(engine, last, rb) && 2170 timeslice_expired(execlists, last)) { 2171 if (i915_request_completed(last)) { 2172 tasklet_hi_schedule(&execlists->tasklet); 2173 return; 2174 } 2175 2176 ENGINE_TRACE(engine, 2177 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", 2178 last->fence.context, 2179 last->fence.seqno, 2180 last->sched.attr.priority, 2181 execlists->queue_priority_hint, 2182 yesno(timeslice_yield(execlists, last))); 2183 2184 ring_set_paused(engine, 1); 2185 defer_active(engine); 2186 2187 /* 2188 * Unlike for preemption, if we rewind and continue 2189 * executing the same context as previously active, 2190 * the order of execution will remain the same and 2191 * the tail will only advance. We do not need to 2192 * force a full context restore, as a lite-restore 2193 * is sufficient to resample the monotonic TAIL. 2194 * 2195 * If we switch to any other context, similarly we 2196 * will not rewind TAIL of current context, and 2197 * normal save/restore will preserve state and allow 2198 * us to later continue executing the same request. 2199 */ 2200 last = NULL; 2201 } else { 2202 /* 2203 * Otherwise if we already have a request pending 2204 * for execution after the current one, we can 2205 * just wait until the next CS event before 2206 * queuing more. In either case we will force a 2207 * lite-restore preemption event, but if we wait 2208 * we hopefully coalesce several updates into a single 2209 * submission. 2210 */ 2211 if (!list_is_last(&last->sched.link, 2212 &engine->active.requests)) { 2213 /* 2214 * Even if ELSP[1] is occupied and not worthy 2215 * of timeslices, our queue might be. 2216 */ 2217 start_timeslice(engine, queue_prio(execlists)); 2218 return; 2219 } 2220 } 2221 } 2222 2223 while (rb) { /* XXX virtual is always taking precedence */ 2224 struct virtual_engine *ve = 2225 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 2226 struct i915_request *rq; 2227 2228 spin_lock(&ve->base.active.lock); 2229 2230 rq = ve->request; 2231 if (unlikely(!rq)) { /* lost the race to a sibling */ 2232 spin_unlock(&ve->base.active.lock); 2233 rb_erase_cached(rb, &execlists->virtual); 2234 RB_CLEAR_NODE(rb); 2235 rb = rb_first_cached(&execlists->virtual); 2236 continue; 2237 } 2238 2239 GEM_BUG_ON(rq != ve->request); 2240 GEM_BUG_ON(rq->engine != &ve->base); 2241 GEM_BUG_ON(rq->context != &ve->context); 2242 2243 if (rq_prio(rq) >= queue_prio(execlists)) { 2244 if (!virtual_matches(ve, rq, engine)) { 2245 spin_unlock(&ve->base.active.lock); 2246 rb = rb_next(rb); 2247 continue; 2248 } 2249 2250 if (last && !can_merge_rq(last, rq)) { 2251 spin_unlock(&ve->base.active.lock); 2252 start_timeslice(engine, rq_prio(rq)); 2253 return; /* leave this for another sibling */ 2254 } 2255 2256 ENGINE_TRACE(engine, 2257 "virtual rq=%llx:%lld%s, new engine? %s\n", 2258 rq->fence.context, 2259 rq->fence.seqno, 2260 i915_request_completed(rq) ? "!" : 2261 i915_request_started(rq) ? 
"*" : 2262 "", 2263 yesno(engine != ve->siblings[0])); 2264 2265 WRITE_ONCE(ve->request, NULL); 2266 WRITE_ONCE(ve->base.execlists.queue_priority_hint, 2267 INT_MIN); 2268 rb_erase_cached(rb, &execlists->virtual); 2269 RB_CLEAR_NODE(rb); 2270 2271 GEM_BUG_ON(!(rq->execution_mask & engine->mask)); 2272 WRITE_ONCE(rq->engine, engine); 2273 2274 if (engine != ve->siblings[0]) { 2275 u32 *regs = ve->context.lrc_reg_state; 2276 unsigned int n; 2277 2278 GEM_BUG_ON(READ_ONCE(ve->context.inflight)); 2279 2280 if (!intel_engine_has_relative_mmio(engine)) 2281 virtual_update_register_offsets(regs, 2282 engine); 2283 2284 if (!list_empty(&ve->context.signals)) 2285 virtual_xfer_breadcrumbs(ve); 2286 2287 /* 2288 * Move the bound engine to the top of the list 2289 * for future execution. We then kick this 2290 * tasklet first before checking others, so that 2291 * we preferentially reuse this set of bound 2292 * registers. 2293 */ 2294 for (n = 1; n < ve->num_siblings; n++) { 2295 if (ve->siblings[n] == engine) { 2296 swap(ve->siblings[n], 2297 ve->siblings[0]); 2298 break; 2299 } 2300 } 2301 2302 GEM_BUG_ON(ve->siblings[0] != engine); 2303 } 2304 2305 if (__i915_request_submit(rq)) { 2306 submit = true; 2307 last = rq; 2308 } 2309 i915_request_put(rq); 2310 2311 /* 2312 * Hmm, we have a bunch of virtual engine requests, 2313 * but the first one was already completed (thanks 2314 * preempt-to-busy!). Keep looking at the veng queue 2315 * until we have no more relevant requests (i.e. 2316 * the normal submit queue has higher priority). 2317 */ 2318 if (!submit) { 2319 spin_unlock(&ve->base.active.lock); 2320 rb = rb_first_cached(&execlists->virtual); 2321 continue; 2322 } 2323 } 2324 2325 spin_unlock(&ve->base.active.lock); 2326 break; 2327 } 2328 2329 while ((rb = rb_first_cached(&execlists->queue))) { 2330 struct i915_priolist *p = to_priolist(rb); 2331 struct i915_request *rq, *rn; 2332 int i; 2333 2334 priolist_for_each_request_consume(rq, rn, p, i) { 2335 bool merge = true; 2336 2337 /* 2338 * Can we combine this request with the current port? 2339 * It has to be the same context/ringbuffer and not 2340 * have any exceptions (e.g. GVT saying never to 2341 * combine contexts). 2342 * 2343 * If we can combine the requests, we can execute both 2344 * by updating the RING_TAIL to point to the end of the 2345 * second request, and so we never need to tell the 2346 * hardware about the first. 2347 */ 2348 if (last && !can_merge_rq(last, rq)) { 2349 /* 2350 * If we are on the second port and cannot 2351 * combine this request with the last, then we 2352 * are done. 2353 */ 2354 if (port == last_port) 2355 goto done; 2356 2357 /* 2358 * We must not populate both ELSP[] with the 2359 * same LRCA, i.e. we must submit 2 different 2360 * contexts if we submit 2 ELSP. 2361 */ 2362 if (last->context == rq->context) 2363 goto done; 2364 2365 if (i915_request_has_sentinel(last)) 2366 goto done; 2367 2368 /* 2369 * If GVT overrides us we only ever submit 2370 * port[0], leaving port[1] empty. Note that we 2371 * also have to be careful that we don't queue 2372 * the same context (even though a different 2373 * request) to the second port. 
2374 */ 2375 if (ctx_single_port_submission(last->context) || 2376 ctx_single_port_submission(rq->context)) 2377 goto done; 2378 2379 merge = false; 2380 } 2381 2382 if (__i915_request_submit(rq)) { 2383 if (!merge) { 2384 *port = execlists_schedule_in(last, port - execlists->pending); 2385 port++; 2386 last = NULL; 2387 } 2388 2389 GEM_BUG_ON(last && 2390 !can_merge_ctx(last->context, 2391 rq->context)); 2392 GEM_BUG_ON(last && 2393 i915_seqno_passed(last->fence.seqno, 2394 rq->fence.seqno)); 2395 2396 submit = true; 2397 last = rq; 2398 } 2399 } 2400 2401 rb_erase_cached(&p->node, &execlists->queue); 2402 i915_priolist_free(p); 2403 } 2404 2405 done: 2406 /* 2407 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. 2408 * 2409 * We choose the priority hint such that if we add a request of greater 2410 * priority than this, we kick the submission tasklet to decide on 2411 * the right order of submitting the requests to hardware. We must 2412 * also be prepared to reorder requests as they are in-flight on the 2413 * HW. We derive the priority hint then as the first "hole" in 2414 * the HW submission ports and if there are no available slots, 2415 * the priority of the lowest executing request, i.e. last. 2416 * 2417 * When we do receive a higher priority request ready to run from the 2418 * user, see queue_request(), the priority hint is bumped to that 2419 * request triggering preemption on the next dequeue (or subsequent 2420 * interrupt for secondary ports). 2421 */ 2422 execlists->queue_priority_hint = queue_prio(execlists); 2423 2424 if (submit) { 2425 *port = execlists_schedule_in(last, port - execlists->pending); 2426 execlists->switch_priority_hint = 2427 switch_prio(engine, *execlists->pending); 2428 2429 /* 2430 * Skip if we ended up with exactly the same set of requests, 2431 * e.g. 
trying to timeslice a pair of ordered contexts 2432 */ 2433 if (!memcmp(active, execlists->pending, 2434 (port - execlists->pending + 1) * sizeof(*port))) { 2435 do 2436 execlists_schedule_out(fetch_and_zero(port)); 2437 while (port-- != execlists->pending); 2438 2439 goto skip_submit; 2440 } 2441 clear_ports(port + 1, last_port - port); 2442 2443 WRITE_ONCE(execlists->yield, -1); 2444 set_preempt_timeout(engine, *active); 2445 execlists_submit_ports(engine); 2446 } else { 2447 start_timeslice(engine, execlists->queue_priority_hint); 2448 skip_submit: 2449 ring_set_paused(engine, 0); 2450 } 2451 } 2452 2453 static void 2454 cancel_port_requests(struct intel_engine_execlists * const execlists) 2455 { 2456 struct i915_request * const *port; 2457 2458 for (port = execlists->pending; *port; port++) 2459 execlists_schedule_out(*port); 2460 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending)); 2461 2462 /* Mark the end of active before we overwrite *active */ 2463 for (port = xchg(&execlists->active, execlists->pending); *port; port++) 2464 execlists_schedule_out(*port); 2465 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight)); 2466 2467 smp_wmb(); /* complete the seqlock for execlists_active() */ 2468 WRITE_ONCE(execlists->active, execlists->inflight); 2469 } 2470 2471 static inline void 2472 invalidate_csb_entries(const u32 *first, const u32 *last) 2473 { 2474 clflush((void *)first); 2475 clflush((void *)last); 2476 } 2477 2478 /* 2479 * Starting with Gen12, the status has a new format: 2480 * 2481 * bit 0: switched to new queue 2482 * bit 1: reserved 2483 * bit 2: semaphore wait mode (poll or signal), only valid when 2484 * switch detail is set to "wait on semaphore" 2485 * bits 3-5: engine class 2486 * bits 6-11: engine instance 2487 * bits 12-14: reserved 2488 * bits 15-25: sw context id of the lrc the GT switched to 2489 * bits 26-31: sw counter of the lrc the GT switched to 2490 * bits 32-35: context switch detail 2491 * - 0: ctx complete 2492 * - 1: wait on sync flip 2493 * - 2: wait on vblank 2494 * - 3: wait on scanline 2495 * - 4: wait on semaphore 2496 * - 5: context preempted (not on SEMAPHORE_WAIT or 2497 * WAIT_FOR_EVENT) 2498 * bit 36: reserved 2499 * bits 37-43: wait detail (for switch detail 1 to 4) 2500 * bits 44-46: reserved 2501 * bits 47-57: sw context id of the lrc the GT switched away from 2502 * bits 58-63: sw counter of the lrc the GT switched away from 2503 */ 2504 static inline bool 2505 gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2506 { 2507 u32 lower_dw = csb[0]; 2508 u32 upper_dw = csb[1]; 2509 bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw); 2510 bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw); 2511 bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE; 2512 2513 /* 2514 * The context switch detail is not guaranteed to be 5 when a preemption 2515 * occurs, so we can't just check for that. The check below works for 2516 * all the cases we care about, including preemptions of WAIT 2517 * instructions and lite-restore. Preempt-to-idle via the CTRL register 2518 * would require some extra handling, but we don't support that. 2519 */ 2520 if (!ctx_away_valid || new_queue) { 2521 GEM_BUG_ON(!ctx_to_valid); 2522 return true; 2523 } 2524 2525 /* 2526 * switch detail = 5 is covered by the case above and we do not expect a 2527 * context switch on an unsuccessful wait instruction since we always 2528 * use polling mode. 
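* Reaching this point therefore means a valid outgoing context and no switch to a new queue, i.e. an ordinary context-complete event rather than a promotion, so the switch detail is asserted to be zero below and no promotion is reported.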
*/ 2530 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw)); 2531 return false; 2532 } 2533 2534 static inline bool 2535 gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb) 2536 { 2537 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED); 2538 } 2539 2540 static void process_csb(struct intel_engine_cs *engine) 2541 { 2542 struct intel_engine_execlists * const execlists = &engine->execlists; 2543 const u32 * const buf = execlists->csb_status; 2544 const u8 num_entries = execlists->csb_size; 2545 u8 head, tail; 2546 2547 /* 2548 * As we modify our execlists state tracking we require exclusive 2549 * access. Either we are inside the tasklet, or the tasklet is disabled 2550 * and we assume that is only inside the reset paths and so serialised. 2551 */ 2552 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) && 2553 !reset_in_progress(execlists)); 2554 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine)); 2555 2556 /* 2557 * Note that csb_write, csb_status may be either in HWSP or mmio. 2558 * When reading from the csb_write mmio register, we have to be 2559 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is 2560 * the low 4bits. As it happens we know the next 4bits are always 2561 * zero and so we can simply mask off the low u8 of the register 2562 * and treat it identically to reading from the HWSP (without having 2563 * to use explicit shifting and masking, and probably bifurcating 2564 * the code to handle the legacy mmio read). 2565 */ 2566 head = execlists->csb_head; 2567 tail = READ_ONCE(*execlists->csb_write); 2568 if (unlikely(head == tail)) 2569 return; 2570 2571 /* 2572 * We will consume all events from HW, or at least pretend to. 2573 * 2574 * The sequence of events from the HW is deterministic, and derived 2575 * from our writes to the ELSP, with a smidgen of variability for 2576 * the arrival of the asynchronous requests wrt the inflight 2577 * execution. If the HW sends an event that does not correspond with 2578 * the one we are expecting, we have to abandon all hope as we lose 2579 * all tracking of what the engine is actually executing. We will 2580 * only detect we are out of sequence with the HW when we get an 2581 * 'impossible' event because we have already drained our own 2582 * preemption/promotion queue. If this occurs, we know that we likely 2583 * lost track of execution earlier and must unwind and restart; the 2584 * simplest way is to stop processing the event queue and force the 2585 * engine to reset. 2586 */ 2587 execlists->csb_head = tail; 2588 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail); 2589 2590 /* 2591 * Hopefully paired with a wmb() in HW! 2592 * 2593 * We must complete the read of the write pointer before any reads 2594 * from the CSB, so that we do not see stale values. Without an rmb 2595 * (lfence) the HW may speculatively perform the CSB[] reads *before* 2596 * we perform the READ_ONCE(*csb_write). 2597 */ 2598 rmb(); 2599 do { 2600 bool promote; 2601 2602 if (++head == num_entries) 2603 head = 0; 2604 2605 /* 2606 * We are flying near dragons again. 2607 * 2608 * We hold a reference to the request in execlist_port[] 2609 * but no more than that. We are operating in softirq 2610 * context and so cannot hold any mutex or sleep. That 2611 * prevents us from stopping the requests we are processing 2612 * in port[] from being retired simultaneously (the 2613 * breadcrumb will be complete before we see the 2614 * context-switch).
As we only hold the reference to the 2615 * request, any pointer chasing underneath the request 2616 * is subject to a potential use-after-free. Thus we 2617 * store all of the bookkeeping within port[] as 2618 * required, and avoid using unguarded pointers beneath 2619 * request itself. The same applies to the atomic 2620 * status notifier. 2621 */ 2622 2623 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n", 2624 head, buf[2 * head + 0], buf[2 * head + 1]); 2625 2626 if (INTEL_GEN(engine->i915) >= 12) 2627 promote = gen12_csb_parse(execlists, buf + 2 * head); 2628 else 2629 promote = gen8_csb_parse(execlists, buf + 2 * head); 2630 if (promote) { 2631 struct i915_request * const *old = execlists->active; 2632 2633 if (GEM_WARN_ON(!*execlists->pending)) { 2634 execlists->error_interrupt |= ERROR_CSB; 2635 break; 2636 } 2637 2638 ring_set_paused(engine, 0); 2639 2640 /* Point active to the new ELSP; prevent overwriting */ 2641 WRITE_ONCE(execlists->active, execlists->pending); 2642 smp_wmb(); /* notify execlists_active() */ 2643 2644 /* cancel old inflight, prepare for switch */ 2645 trace_ports(execlists, "preempted", old); 2646 while (*old) 2647 execlists_schedule_out(*old++); 2648 2649 /* switch pending to inflight */ 2650 GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); 2651 memcpy(execlists->inflight, 2652 execlists->pending, 2653 execlists_num_ports(execlists) * 2654 sizeof(*execlists->pending)); 2655 smp_wmb(); /* complete the seqlock */ 2656 WRITE_ONCE(execlists->active, execlists->inflight); 2657 2658 WRITE_ONCE(execlists->pending[0], NULL); 2659 } else { 2660 if (GEM_WARN_ON(!*execlists->active)) { 2661 execlists->error_interrupt |= ERROR_CSB; 2662 break; 2663 } 2664 2665 /* port0 completed, advanced to port1 */ 2666 trace_ports(execlists, "completed", execlists->active); 2667 2668 /* 2669 * We rely on the hardware being strongly 2670 * ordered, that the breadcrumb write is 2671 * coherent (visible from the CPU) before the 2672 * user interrupt is processed. One might assume 2673 * that the breadcrumb write being before the 2674 * user interrupt and the CS event for the context 2675 * switch would therefore be before the CS event 2676 * itself... 
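* ... but that ordering may not hold in practice, which is why the debug block below complains if we see a context-complete event for a request whose breadcrumb has not yet been observed.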
*/ 2678 if (GEM_SHOW_DEBUG() && 2679 !i915_request_completed(*execlists->active)) { 2680 struct i915_request *rq = *execlists->active; 2681 const u32 *regs __maybe_unused = 2682 rq->context->lrc_reg_state; 2683 2684 ENGINE_TRACE(engine, 2685 "context completed before request!\n"); 2686 ENGINE_TRACE(engine, 2687 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", 2688 ENGINE_READ(engine, RING_START), 2689 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, 2690 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR, 2691 ENGINE_READ(engine, RING_CTL), 2692 ENGINE_READ(engine, RING_MI_MODE)); 2693 ENGINE_TRACE(engine, 2694 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ", 2695 i915_ggtt_offset(rq->ring->vma), 2696 rq->head, rq->tail, 2697 rq->fence.context, 2698 lower_32_bits(rq->fence.seqno), 2699 hwsp_seqno(rq)); 2700 ENGINE_TRACE(engine, 2701 "ctx:{start:%08x, head:%04x, tail:%04x}, ", 2702 regs[CTX_RING_START], 2703 regs[CTX_RING_HEAD], 2704 regs[CTX_RING_TAIL]); 2705 } 2706 2707 execlists_schedule_out(*execlists->active++); 2708 2709 GEM_BUG_ON(execlists->active - execlists->inflight > 2710 execlists_num_ports(execlists)); 2711 } 2712 } while (head != tail); 2713 2714 set_timeslice(engine); 2715 2716 /* 2717 * Gen11 has proven to fail wrt the global observation point between 2718 * entry and tail update, failing on the ordering and thus 2719 * we see an old entry in the context status buffer. 2720 * 2721 * Forcibly evict the entries for the next gpu csb update, 2722 * to increase the odds that we get fresh entries even with 2723 * non-working hardware. The cost of doing so comes out mostly in 2724 * the wash, as hardware, working or not, will need to do the 2725 * invalidation beforehand anyway. 2726 */ 2727 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); 2728 } 2729 2730 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) 2731 { 2732 lockdep_assert_held(&engine->active.lock); 2733 if (!READ_ONCE(engine->execlists.pending[0])) { 2734 rcu_read_lock(); /* protect peeking at execlists->active */ 2735 execlists_dequeue(engine); 2736 rcu_read_unlock(); 2737 } 2738 } 2739 2740 static void __execlists_hold(struct i915_request *rq) 2741 { 2742 LIST_HEAD(list); 2743 2744 do { 2745 struct i915_dependency *p; 2746 2747 if (i915_request_is_active(rq)) 2748 __i915_request_unsubmit(rq); 2749 2750 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2751 list_move_tail(&rq->sched.link, &rq->engine->active.hold); 2752 i915_request_set_hold(rq); 2753 RQ_TRACE(rq, "on hold\n"); 2754 2755 for_each_waiter(p, rq) { 2756 struct i915_request *w = 2757 container_of(p->waiter, typeof(*w), sched); 2758 2759 /* Leave semaphores spinning on the other engines */ 2760 if (w->engine != rq->engine) 2761 continue; 2762 2763 if (!i915_request_is_ready(w)) 2764 continue; 2765 2766 if (i915_request_completed(w)) 2767 continue; 2768 2769 if (i915_request_on_hold(w)) 2770 continue; 2771 2772 list_move_tail(&w->sched.link, &list); 2773 } 2774 2775 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2776 } while (rq); 2777 } 2778 2779 static bool execlists_hold(struct intel_engine_cs *engine, 2780 struct i915_request *rq) 2781 { 2782 spin_lock_irq(&engine->active.lock); 2783 2784 if (i915_request_completed(rq)) { /* too late!
*/ 2785 rq = NULL; 2786 goto unlock; 2787 } 2788 2789 if (rq->engine != engine) { /* preempted virtual engine */ 2790 struct virtual_engine *ve = to_virtual_engine(rq->engine); 2791 2792 /* 2793 * intel_context_inflight() is only protected by virtue 2794 * of process_csb() being called only by the tasklet (or 2795 * directly from inside reset while the tasklet is suspended). 2796 * Assert that neither of those is allowed to run while we 2797 * poke at the request queues. 2798 */ 2799 GEM_BUG_ON(!reset_in_progress(&engine->execlists)); 2800 2801 /* 2802 * An unsubmitted request along a virtual engine will 2803 * remain on the active (this) engine until we are able 2804 * to process the context switch away (and so mark the 2805 * context as no longer in flight). That cannot have happened 2806 * yet, otherwise we would not be hanging! 2807 */ 2808 spin_lock(&ve->base.active.lock); 2809 GEM_BUG_ON(intel_context_inflight(rq->context) != engine); 2810 GEM_BUG_ON(ve->request != rq); 2811 ve->request = NULL; 2812 spin_unlock(&ve->base.active.lock); 2813 i915_request_put(rq); 2814 2815 rq->engine = engine; 2816 } 2817 2818 /* 2819 * Transfer this request onto the hold queue to prevent it from 2820 * being resubmitted to HW (and potentially completed) before we have 2821 * released it. Since we may have already submitted following 2822 * requests, we need to remove those as well. 2823 */ 2824 GEM_BUG_ON(i915_request_on_hold(rq)); 2825 GEM_BUG_ON(rq->engine != engine); 2826 __execlists_hold(rq); 2827 GEM_BUG_ON(list_empty(&engine->active.hold)); 2828 2829 unlock: 2830 spin_unlock_irq(&engine->active.lock); 2831 return rq; 2832 } 2833 2834 static bool hold_request(const struct i915_request *rq) 2835 { 2836 struct i915_dependency *p; 2837 bool result = false; 2838 2839 /* 2840 * If one of our ancestors is on hold, we must also be on hold, 2841 * otherwise we will bypass it and execute before it.
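* To that end, walk our signalers below and inherit the hold status from any signaler on this engine.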
2842 */ 2843 rcu_read_lock(); 2844 for_each_signaler(p, rq) { 2845 const struct i915_request *s = 2846 container_of(p->signaler, typeof(*s), sched); 2847 2848 if (s->engine != rq->engine) 2849 continue; 2850 2851 result = i915_request_on_hold(s); 2852 if (result) 2853 break; 2854 } 2855 rcu_read_unlock(); 2856 2857 return result; 2858 } 2859 2860 static void __execlists_unhold(struct i915_request *rq) 2861 { 2862 LIST_HEAD(list); 2863 2864 do { 2865 struct i915_dependency *p; 2866 2867 RQ_TRACE(rq, "hold release\n"); 2868 2869 GEM_BUG_ON(!i915_request_on_hold(rq)); 2870 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit)); 2871 2872 i915_request_clear_hold(rq); 2873 list_move_tail(&rq->sched.link, 2874 i915_sched_lookup_priolist(rq->engine, 2875 rq_prio(rq))); 2876 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 2877 2878 /* Also release any children on this engine that are ready */ 2879 for_each_waiter(p, rq) { 2880 struct i915_request *w = 2881 container_of(p->waiter, typeof(*w), sched); 2882 2883 /* Propagate any change in error status */ 2884 if (rq->fence.error) 2885 i915_request_set_error_once(w, rq->fence.error); 2886 2887 if (w->engine != rq->engine) 2888 continue; 2889 2890 if (!i915_request_on_hold(w)) 2891 continue; 2892 2893 /* Check that no other parents are also on hold */ 2894 if (hold_request(w)) 2895 continue; 2896 2897 list_move_tail(&w->sched.link, &list); 2898 } 2899 2900 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link); 2901 } while (rq); 2902 } 2903 2904 static void execlists_unhold(struct intel_engine_cs *engine, 2905 struct i915_request *rq) 2906 { 2907 spin_lock_irq(&engine->active.lock); 2908 2909 /* 2910 * Move this request back to the priority queue, and all of its 2911 * children and grandchildren that were suspended along with it. 2912 */ 2913 __execlists_unhold(rq); 2914 2915 if (rq_prio(rq) > engine->execlists.queue_priority_hint) { 2916 engine->execlists.queue_priority_hint = rq_prio(rq); 2917 tasklet_hi_schedule(&engine->execlists.tasklet); 2918 } 2919 2920 spin_unlock_irq(&engine->active.lock); 2921 } 2922 2923 struct execlists_capture { 2924 struct work_struct work; 2925 struct i915_request *rq; 2926 struct i915_gpu_coredump *error; 2927 }; 2928 2929 static void execlists_capture_work(struct work_struct *work) 2930 { 2931 struct execlists_capture *cap = container_of(work, typeof(*cap), work); 2932 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 2933 struct intel_engine_cs *engine = cap->rq->engine; 2934 struct intel_gt_coredump *gt = cap->error->gt; 2935 struct intel_engine_capture_vma *vma; 2936 2937 /* Compress all the objects attached to the request, slow! 
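* That is why the compression runs from this worker, where GFP_KERNEL allocations are allowed, rather than in the softirq context that captured the engine registers with GFP_ATOMIC.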
*/ 2938 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp); 2939 if (vma) { 2940 struct i915_vma_compress *compress = 2941 i915_vma_capture_prepare(gt); 2942 2943 intel_engine_coredump_add_vma(gt->engine, vma, compress); 2944 i915_vma_capture_finish(gt, compress); 2945 } 2946 2947 gt->simulated = gt->engine->simulated; 2948 cap->error->simulated = gt->simulated; 2949 2950 /* Publish the error state, and announce it to the world */ 2951 i915_error_state_store(cap->error); 2952 i915_gpu_coredump_put(cap->error); 2953 2954 /* Return this request and all that depend upon it for signaling */ 2955 execlists_unhold(engine, cap->rq); 2956 i915_request_put(cap->rq); 2957 2958 kfree(cap); 2959 } 2960 2961 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine) 2962 { 2963 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; 2964 struct execlists_capture *cap; 2965 2966 cap = kmalloc(sizeof(*cap), gfp); 2967 if (!cap) 2968 return NULL; 2969 2970 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp); 2971 if (!cap->error) 2972 goto err_cap; 2973 2974 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp); 2975 if (!cap->error->gt) 2976 goto err_gpu; 2977 2978 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp); 2979 if (!cap->error->gt->engine) 2980 goto err_gt; 2981 2982 return cap; 2983 2984 err_gt: 2985 kfree(cap->error->gt); 2986 err_gpu: 2987 kfree(cap->error); 2988 err_cap: 2989 kfree(cap); 2990 return NULL; 2991 } 2992 2993 static struct i915_request * 2994 active_context(struct intel_engine_cs *engine, u32 ccid) 2995 { 2996 const struct intel_engine_execlists * const el = &engine->execlists; 2997 struct i915_request * const *port, *rq; 2998 2999 /* 3000 * Use the most recent result from process_csb(), but just in case 3001 * we trigger an error (via interrupt) before the first CS event has 3002 * been written, peek at the next submission. 3003 */ 3004 3005 for (port = el->active; (rq = *port); port++) { 3006 if (rq->context->lrc.ccid == ccid) { 3007 ENGINE_TRACE(engine, 3008 "ccid found at active:%zd\n", 3009 port - el->active); 3010 return rq; 3011 } 3012 } 3013 3014 for (port = el->pending; (rq = *port); port++) { 3015 if (rq->context->lrc.ccid == ccid) { 3016 ENGINE_TRACE(engine, 3017 "ccid found at pending:%zd\n", 3018 port - el->pending); 3019 return rq; 3020 } 3021 } 3022 3023 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); 3024 return NULL; 3025 } 3026 3027 static u32 active_ccid(struct intel_engine_cs *engine) 3028 { 3029 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); 3030 } 3031 3032 static void execlists_capture(struct intel_engine_cs *engine) 3033 { 3034 struct execlists_capture *cap; 3035 3036 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)) 3037 return; 3038 3039 /* 3040 * We need to _quickly_ capture the engine state before we reset. 3041 * We are inside an atomic section (softirq) here and we are delaying 3042 * the forced preemption event. 3043 */ 3044 cap = capture_regs(engine); 3045 if (!cap) 3046 return; 3047 3048 spin_lock_irq(&engine->active.lock); 3049 cap->rq = active_context(engine, active_ccid(engine)); 3050 if (cap->rq) { 3051 cap->rq = active_request(cap->rq->context->timeline, cap->rq); 3052 cap->rq = i915_request_get_rcu(cap->rq); 3053 } 3054 spin_unlock_irq(&engine->active.lock); 3055 if (!cap->rq) 3056 goto err_free; 3057 3058 /* 3059 * Remove the request from the execlists queue, and take ownership 3060 * of the request. 
We pass it to our worker who will _slowly_ compress 3061 * all the pages the _user_ requested for debugging their batch, after 3062 * which we return it to the queue for signaling. 3063 * 3064 * By removing them from the execlists queue, we also remove the 3065 * requests from being processed by __unwind_incomplete_requests() 3066 * during the intel_engine_reset(), and so they will *not* be replayed 3067 * afterwards. 3068 * 3069 * Note that because we have not yet reset the engine at this point, 3070 * it is possible that the request we have identified as being 3071 * guilty did in fact complete, and we will then hit an arbitration 3072 * point allowing the outstanding preemption to succeed. The likelihood 3073 * of that is very low (as capturing of the engine registers should be 3074 * fast enough to run inside an irq-off atomic section!), so we will 3075 * simply hold that request accountable for being non-preemptible 3076 * long enough to force the reset. 3077 */ 3078 if (!execlists_hold(engine, cap->rq)) 3079 goto err_rq; 3080 3081 INIT_WORK(&cap->work, execlists_capture_work); 3082 schedule_work(&cap->work); 3083 return; 3084 3085 err_rq: 3086 i915_request_put(cap->rq); 3087 err_free: 3088 i915_gpu_coredump_put(cap->error); 3089 kfree(cap); 3090 } 3091 3092 static void execlists_reset(struct intel_engine_cs *engine, const char *msg) 3093 { 3094 const unsigned int bit = I915_RESET_ENGINE + engine->id; 3095 unsigned long *lock = &engine->gt->reset.flags; 3096 3097 if (!intel_has_reset_engine(engine->gt)) 3098 return; 3099 3100 if (test_and_set_bit(bit, lock)) 3101 return; 3102 3103 ENGINE_TRACE(engine, "reset for %s\n", msg); 3104 3105 /* Mark this tasklet as disabled to avoid waiting for it to complete */ 3106 tasklet_disable_nosync(&engine->execlists.tasklet); 3107 3108 ring_set_paused(engine, 1); /* Freeze the current request in place */ 3109 execlists_capture(engine); 3110 intel_engine_reset(engine, msg); 3111 3112 tasklet_enable(&engine->execlists.tasklet); 3113 clear_and_wake_up_bit(bit, lock); 3114 } 3115 3116 static bool preempt_timeout(const struct intel_engine_cs *const engine) 3117 { 3118 const struct timer_list *t = &engine->execlists.preempt; 3119 3120 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT) 3121 return false; 3122 3123 if (!timer_expired(t)) 3124 return false; 3125 3126 return READ_ONCE(engine->execlists.pending[0]); 3127 } 3128 3129 /* 3130 * Check the unread Context Status Buffers and manage the submission of new 3131 * contexts to the ELSP accordingly. 3132 */ 3133 static void execlists_submission_tasklet(unsigned long data) 3134 { 3135 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 3136 bool timeout = preempt_timeout(engine); 3137 3138 process_csb(engine); 3139 3140 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { 3141 const char *msg; 3142 3143 /* Generate the error message in priority order, with user-visible errors first!
*/ 3144 if (engine->execlists.error_interrupt & GENMASK(15, 0)) 3145 msg = "CS error"; /* thrown by a user payload */ 3146 else if (engine->execlists.error_interrupt & ERROR_CSB) 3147 msg = "invalid CSB event"; 3148 else 3149 msg = "internal error"; 3150 3151 engine->execlists.error_interrupt = 0; 3152 execlists_reset(engine, msg); 3153 } 3154 3155 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { 3156 unsigned long flags; 3157 3158 spin_lock_irqsave(&engine->active.lock, flags); 3159 __execlists_submission_tasklet(engine); 3160 spin_unlock_irqrestore(&engine->active.lock, flags); 3161 3162 /* Recheck after serialising with direct-submission */ 3163 if (unlikely(timeout && preempt_timeout(engine))) 3164 execlists_reset(engine, "preemption time out"); 3165 } 3166 } 3167 3168 static void __execlists_kick(struct intel_engine_execlists *execlists) 3169 { 3170 /* Kick the tasklet for some interrupt coalescing and reset handling */ 3171 tasklet_hi_schedule(&execlists->tasklet); 3172 } 3173 3174 #define execlists_kick(t, member) \ 3175 __execlists_kick(container_of(t, struct intel_engine_execlists, member)) 3176 3177 static void execlists_timeslice(struct timer_list *timer) 3178 { 3179 execlists_kick(timer, timer); 3180 } 3181 3182 static void execlists_preempt(struct timer_list *timer) 3183 { 3184 execlists_kick(timer, preempt); 3185 } 3186 3187 static void queue_request(struct intel_engine_cs *engine, 3188 struct i915_request *rq) 3189 { 3190 GEM_BUG_ON(!list_empty(&rq->sched.link)); 3191 list_add_tail(&rq->sched.link, 3192 i915_sched_lookup_priolist(engine, rq_prio(rq))); 3193 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); 3194 } 3195 3196 static void __submit_queue_imm(struct intel_engine_cs *engine) 3197 { 3198 struct intel_engine_execlists * const execlists = &engine->execlists; 3199 3200 if (reset_in_progress(execlists)) 3201 return; /* defer until we restart the engine following reset */ 3202 3203 __execlists_submission_tasklet(engine); 3204 } 3205 3206 static void submit_queue(struct intel_engine_cs *engine, 3207 const struct i915_request *rq) 3208 { 3209 struct intel_engine_execlists *execlists = &engine->execlists; 3210 3211 if (rq_prio(rq) <= execlists->queue_priority_hint) 3212 return; 3213 3214 execlists->queue_priority_hint = rq_prio(rq); 3215 __submit_queue_imm(engine); 3216 } 3217 3218 static bool ancestor_on_hold(const struct intel_engine_cs *engine, 3219 const struct i915_request *rq) 3220 { 3221 GEM_BUG_ON(i915_request_on_hold(rq)); 3222 return !list_empty(&engine->active.hold) && hold_request(rq); 3223 } 3224 3225 static void flush_csb(struct intel_engine_cs *engine) 3226 { 3227 struct intel_engine_execlists *el = &engine->execlists; 3228 3229 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) { 3230 if (!reset_in_progress(el)) 3231 process_csb(engine); 3232 tasklet_unlock(&el->tasklet); 3233 } 3234 } 3235 3236 static void execlists_submit_request(struct i915_request *request) 3237 { 3238 struct intel_engine_cs *engine = request->engine; 3239 unsigned long flags; 3240 3241 /* Hopefully we clear execlists->pending[] to let us through */ 3242 flush_csb(engine); 3243 3244 /* Will be called from irq-context when using foreign fences. 
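* hence engine->active.lock is taken with the spin_lock_irqsave() variant below, which is safe from both process and irq context.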
*/ 3245 spin_lock_irqsave(&engine->active.lock, flags); 3246 3247 if (unlikely(ancestor_on_hold(engine, request))) { 3248 RQ_TRACE(request, "ancestor on hold\n"); 3249 list_add_tail(&request->sched.link, &engine->active.hold); 3250 i915_request_set_hold(request); 3251 } else { 3252 queue_request(engine, request); 3253 3254 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); 3255 GEM_BUG_ON(list_empty(&request->sched.link)); 3256 3257 submit_queue(engine, request); 3258 } 3259 3260 spin_unlock_irqrestore(&engine->active.lock, flags); 3261 } 3262 3263 static void __execlists_context_fini(struct intel_context *ce) 3264 { 3265 intel_ring_put(ce->ring); 3266 i915_vma_put(ce->state); 3267 } 3268 3269 static void execlists_context_destroy(struct kref *kref) 3270 { 3271 struct intel_context *ce = container_of(kref, typeof(*ce), ref); 3272 3273 GEM_BUG_ON(!i915_active_is_idle(&ce->active)); 3274 GEM_BUG_ON(intel_context_is_pinned(ce)); 3275 3276 if (ce->state) 3277 __execlists_context_fini(ce); 3278 3279 intel_context_fini(ce); 3280 intel_context_free(ce); 3281 } 3282 3283 static void 3284 set_redzone(void *vaddr, const struct intel_engine_cs *engine) 3285 { 3286 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3287 return; 3288 3289 vaddr += engine->context_size; 3290 3291 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE); 3292 } 3293 3294 static void 3295 check_redzone(const void *vaddr, const struct intel_engine_cs *engine) 3296 { 3297 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 3298 return; 3299 3300 vaddr += engine->context_size; 3301 3302 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) 3303 drm_err_once(&engine->i915->drm, 3304 "%s context redzone overwritten!\n", 3305 engine->name); 3306 } 3307 3308 static void execlists_context_unpin(struct intel_context *ce) 3309 { 3310 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, 3311 ce->engine); 3312 3313 i915_gem_object_unpin_map(ce->state->obj); 3314 } 3315 3316 static u32 * 3317 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) 3318 { 3319 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3320 MI_SRM_LRM_GLOBAL_GTT | 3321 MI_LRI_LRM_CS_MMIO; 3322 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3323 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3324 CTX_TIMESTAMP * sizeof(u32); 3325 *cs++ = 0; 3326 3327 *cs++ = MI_LOAD_REGISTER_REG | 3328 MI_LRR_SOURCE_CS_MMIO | 3329 MI_LRI_LRM_CS_MMIO; 3330 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3331 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3332 3333 *cs++ = MI_LOAD_REGISTER_REG | 3334 MI_LRR_SOURCE_CS_MMIO | 3335 MI_LRI_LRM_CS_MMIO; 3336 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3337 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); 3338 3339 return cs; 3340 } 3341 3342 static u32 * 3343 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) 3344 { 3345 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); 3346 3347 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3348 MI_SRM_LRM_GLOBAL_GTT | 3349 MI_LRI_LRM_CS_MMIO; 3350 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3351 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3352 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); 3353 *cs++ = 0; 3354 3355 return cs; 3356 } 3357 3358 static u32 * 3359 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) 3360 { 3361 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1); 3362 3363 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | 3364 MI_SRM_LRM_GLOBAL_GTT | 3365 MI_LRI_LRM_CS_MMIO; 3366 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 
0)); 3367 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + 3368 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); 3369 *cs++ = 0; 3370 3371 *cs++ = MI_LOAD_REGISTER_REG | 3372 MI_LRR_SOURCE_CS_MMIO | 3373 MI_LRI_LRM_CS_MMIO; 3374 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); 3375 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); 3376 3377 return cs; 3378 } 3379 3380 static u32 * 3381 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) 3382 { 3383 cs = gen12_emit_timestamp_wa(ce, cs); 3384 cs = gen12_emit_cmd_buf_wa(ce, cs); 3385 cs = gen12_emit_restore_scratch(ce, cs); 3386 3387 return cs; 3388 } 3389 3390 static u32 * 3391 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) 3392 { 3393 cs = gen12_emit_timestamp_wa(ce, cs); 3394 cs = gen12_emit_restore_scratch(ce, cs); 3395 3396 return cs; 3397 } 3398 3399 static inline u32 context_wa_bb_offset(const struct intel_context *ce) 3400 { 3401 return PAGE_SIZE * ce->wa_bb_page; 3402 } 3403 3404 static u32 *context_indirect_bb(const struct intel_context *ce) 3405 { 3406 void *ptr; 3407 3408 GEM_BUG_ON(!ce->wa_bb_page); 3409 3410 ptr = ce->lrc_reg_state; 3411 ptr -= LRC_STATE_OFFSET; /* back to start of context image */ 3412 ptr += context_wa_bb_offset(ce); 3413 3414 return ptr; 3415 } 3416 3417 static void 3418 setup_indirect_ctx_bb(const struct intel_context *ce, 3419 const struct intel_engine_cs *engine, 3420 u32 *(*emit)(const struct intel_context *, u32 *)) 3421 { 3422 u32 * const start = context_indirect_bb(ce); 3423 u32 *cs; 3424 3425 cs = emit(ce, start); 3426 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); 3427 while ((unsigned long)cs % CACHELINE_BYTES) 3428 *cs++ = MI_NOOP; 3429 3430 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, 3431 i915_ggtt_offset(ce->state) + 3432 context_wa_bb_offset(ce), 3433 (cs - start) * sizeof(*cs)); 3434 } 3435 3436 static void 3437 __execlists_update_reg_state(const struct intel_context *ce, 3438 const struct intel_engine_cs *engine, 3439 u32 head) 3440 { 3441 struct intel_ring *ring = ce->ring; 3442 u32 *regs = ce->lrc_reg_state; 3443 3444 GEM_BUG_ON(!intel_ring_offset_valid(ring, head)); 3445 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); 3446 3447 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma); 3448 regs[CTX_RING_HEAD] = head; 3449 regs[CTX_RING_TAIL] = ring->tail; 3450 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 3451 3452 /* RPCS */ 3453 if (engine->class == RENDER_CLASS) { 3454 regs[CTX_R_PWR_CLK_STATE] = 3455 intel_sseu_make_rpcs(engine->gt, &ce->sseu); 3456 3457 i915_oa_init_reg_state(ce, engine); 3458 } 3459 3460 if (ce->wa_bb_page) { 3461 u32 *(*fn)(const struct intel_context *ce, u32 *cs); 3462 3463 fn = gen12_emit_indirect_ctx_xcs; 3464 if (ce->engine->class == RENDER_CLASS) 3465 fn = gen12_emit_indirect_ctx_rcs; 3466 3467 /* Mutually exclusive wrt to global indirect bb */ 3468 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); 3469 setup_indirect_ctx_bb(ce, engine, fn); 3470 } 3471 } 3472 3473 static int 3474 __execlists_context_pin(struct intel_context *ce, 3475 struct intel_engine_cs *engine) 3476 { 3477 void *vaddr; 3478 3479 GEM_BUG_ON(!ce->state); 3480 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 3481 3482 vaddr = i915_gem_object_pin_map(ce->state->obj, 3483 i915_coherent_map_type(engine->i915) | 3484 I915_MAP_OVERRIDE); 3485 if (IS_ERR(vaddr)) 3486 return PTR_ERR(vaddr); 3487 3488 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; 3489 ce->lrc_reg_state = vaddr + 
LRC_STATE_OFFSET; 3490 __execlists_update_reg_state(ce, engine, ce->ring->tail); 3491 3492 return 0; 3493 } 3494 3495 static int execlists_context_pin(struct intel_context *ce) 3496 { 3497 return __execlists_context_pin(ce, ce->engine); 3498 } 3499 3500 static int execlists_context_alloc(struct intel_context *ce) 3501 { 3502 return __execlists_context_alloc(ce, ce->engine); 3503 } 3504 3505 static void execlists_context_reset(struct intel_context *ce) 3506 { 3507 CE_TRACE(ce, "reset\n"); 3508 GEM_BUG_ON(!intel_context_is_pinned(ce)); 3509 3510 intel_ring_reset(ce->ring, ce->ring->emit); 3511 3512 /* Scrub away the garbage */ 3513 execlists_init_reg_state(ce->lrc_reg_state, 3514 ce, ce->engine, ce->ring, true); 3515 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail); 3516 3517 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; 3518 } 3519 3520 static const struct intel_context_ops execlists_context_ops = { 3521 .alloc = execlists_context_alloc, 3522 3523 .pin = execlists_context_pin, 3524 .unpin = execlists_context_unpin, 3525 3526 .enter = intel_context_enter_engine, 3527 .exit = intel_context_exit_engine, 3528 3529 .reset = execlists_context_reset, 3530 .destroy = execlists_context_destroy, 3531 }; 3532 3533 static int gen8_emit_init_breadcrumb(struct i915_request *rq) 3534 { 3535 u32 *cs; 3536 3537 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); 3538 if (!i915_request_timeline(rq)->has_initial_breadcrumb) 3539 return 0; 3540 3541 cs = intel_ring_begin(rq, 6); 3542 if (IS_ERR(cs)) 3543 return PTR_ERR(cs); 3544 3545 /* 3546 * Check if we have been preempted before we even get started. 3547 * 3548 * After this point i915_request_started() reports true, even if 3549 * we get preempted and so are no longer running. 3550 */ 3551 *cs++ = MI_ARB_CHECK; 3552 *cs++ = MI_NOOP; 3553 3554 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; 3555 *cs++ = i915_request_timeline(rq)->hwsp_offset; 3556 *cs++ = 0; 3557 *cs++ = rq->fence.seqno - 1; 3558 3559 intel_ring_advance(rq, cs); 3560 3561 /* Record the updated position of the request's payload */ 3562 rq->infix = intel_ring_offset(rq, cs); 3563 3564 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); 3565 3566 return 0; 3567 } 3568 3569 static int emit_pdps(struct i915_request *rq) 3570 { 3571 const struct intel_engine_cs * const engine = rq->engine; 3572 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); 3573 int err, i; 3574 u32 *cs; 3575 3576 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915)); 3577 3578 /* 3579 * Beware ye of the dragons, this sequence is magic! 3580 * 3581 * Small changes to this sequence can cause anything from 3582 * GPU hangs to forcewake errors and machine lockups! 3583 */ 3584 3585 /* Flush any residual operations from the context load */ 3586 err = engine->emit_flush(rq, EMIT_FLUSH); 3587 if (err) 3588 return err; 3589 3590 /* Magic required to prevent forcewake errors! 
*/ 3591 err = engine->emit_flush(rq, EMIT_INVALIDATE); 3592 if (err) 3593 return err; 3594 3595 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); 3596 if (IS_ERR(cs)) 3597 return PTR_ERR(cs); 3598 3599 /* Ensure the LRI have landed before we invalidate & continue */ 3600 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; 3601 for (i = GEN8_3LVL_PDPES; i--; ) { 3602 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); 3603 u32 base = engine->mmio_base; 3604 3605 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); 3606 *cs++ = upper_32_bits(pd_daddr); 3607 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); 3608 *cs++ = lower_32_bits(pd_daddr); 3609 } 3610 *cs++ = MI_NOOP; 3611 3612 intel_ring_advance(rq, cs); 3613 3614 return 0; 3615 } 3616 3617 static int execlists_request_alloc(struct i915_request *request) 3618 { 3619 int ret; 3620 3621 GEM_BUG_ON(!intel_context_is_pinned(request->context)); 3622 3623 /* 3624 * Flush enough space to reduce the likelihood of waiting after 3625 * we start building the request - in which case we will just 3626 * have to repeat work. 3627 */ 3628 request->reserved_space += EXECLISTS_REQUEST_SIZE; 3629 3630 /* 3631 * Note that after this point, we have committed to using 3632 * this request as it is being used to both track the 3633 * state of engine initialisation and liveness of the 3634 * golden renderstate above. Think twice before you try 3635 * to cancel/unwind this request now. 3636 */ 3637 3638 if (!i915_vm_is_4lvl(request->context->vm)) { 3639 ret = emit_pdps(request); 3640 if (ret) 3641 return ret; 3642 } 3643 3644 /* Unconditionally invalidate GPU caches and TLBs. */ 3645 ret = request->engine->emit_flush(request, EMIT_INVALIDATE); 3646 if (ret) 3647 return ret; 3648 3649 request->reserved_space -= EXECLISTS_REQUEST_SIZE; 3650 return 0; 3651 } 3652 3653 /* 3654 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the 3655 * PIPE_CONTROL instruction. This is required for the flush to happen correctly 3656 * but there is a slight complication as this is applied in the WA batch, where the 3657 * values are only initialized once so we cannot take the register value at the 3658 * beginning and reuse it further; hence we save its value to memory, upload a 3659 * constant value with bit21 set and then we restore it back with the saved value. 3660 * To simplify the WA, a constant value is formed by using the default value 3661 * of this register. This shouldn't be a problem because we are only modifying 3662 * it for a short period and this batch is non-preemptible. We can of course 3663 * use additional instructions that read the actual value of the register 3664 * at that time and set our bit of interest, but it makes the WA complicated. 3665 * 3666 * This WA is also required for Gen9 so extracting it as a function avoids 3667 * code duplication. 3668 */ 3669 static u32 * 3670 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) 3671 { 3672 /* NB no one else is allowed to scribble over scratch + 256!
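* The sequence below: save the live GEN8_L3SQCREG4 value to scratch (SRM), load a constant with the coherent-line-flush bit set (LRI), emit a stalling PIPE_CONTROL with DC flush, then restore the saved value from scratch (LRM).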
*/ 3673 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3674 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3675 *batch++ = intel_gt_scratch_offset(engine->gt, 3676 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3677 *batch++ = 0; 3678 3679 *batch++ = MI_LOAD_REGISTER_IMM(1); 3680 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3681 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; 3682 3683 batch = gen8_emit_pipe_control(batch, 3684 PIPE_CONTROL_CS_STALL | 3685 PIPE_CONTROL_DC_FLUSH_ENABLE, 3686 0); 3687 3688 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; 3689 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); 3690 *batch++ = intel_gt_scratch_offset(engine->gt, 3691 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA); 3692 *batch++ = 0; 3693 3694 return batch; 3695 } 3696 3697 /* 3698 * Typically we only have one indirect_ctx and per_ctx batch buffer which are 3699 * initialized at the beginning and shared across all contexts but this field 3700 * helps us to have multiple batches at different offsets and select them based 3701 * on a criterion. At the moment this batch always starts at the beginning of the page 3702 * and at this point we don't have multiple wa_ctx batch buffers. 3703 * 3704 * The number of WAs applied is not known at the beginning; we use this field 3705 * to return the number of DWORDs written. 3706 * 3707 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END 3708 * so it adds NOOPs as padding to make it cacheline aligned. 3709 * MI_BATCH_BUFFER_END will be added to the perctx batch and both of them together 3710 * make a complete batch buffer. 3711 */ 3712 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3713 { 3714 /* WaDisableCtxRestoreArbitration:bdw,chv */ 3715 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3716 3717 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ 3718 if (IS_BROADWELL(engine->i915)) 3719 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3720 3721 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ 3722 /* Actual scratch location is at 128 bytes offset */ 3723 batch = gen8_emit_pipe_control(batch, 3724 PIPE_CONTROL_FLUSH_L3 | 3725 PIPE_CONTROL_STORE_DATA_INDEX | 3726 PIPE_CONTROL_CS_STALL | 3727 PIPE_CONTROL_QW_WRITE, 3728 LRC_PPHWSP_SCRATCH_ADDR); 3729 3730 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3731 3732 /* Pad to end of cacheline */ 3733 while ((unsigned long)batch % CACHELINE_BYTES) 3734 *batch++ = MI_NOOP; 3735 3736 /* 3737 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because 3738 * execution depends on the length specified in terms of cache lines 3739 * in the register CTX_RCS_INDIRECT_CTX 3740 */ 3741 3742 return batch; 3743 } 3744 3745 struct lri { 3746 i915_reg_t reg; 3747 u32 value; 3748 }; 3749 3750 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) 3751 { 3752 GEM_BUG_ON(!count || count > 63); 3753 3754 *batch++ = MI_LOAD_REGISTER_IMM(count); 3755 do { 3756 *batch++ = i915_mmio_reg_offset(lri->reg); 3757 *batch++ = lri->value; 3758 } while (lri++, --count); 3759 *batch++ = MI_NOOP; 3760 3761 return batch; 3762 } 3763 3764 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3765 { 3766 static const struct lri lri[] = { 3767 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ 3768 { 3769 COMMON_SLICE_CHICKEN2, 3770 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, 3771 0), 3772 }, 3773 3774 /* BSpec: 11391 */ 3775 { 3776 FF_SLICE_CHICKEN, 3777
__MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, 3778 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), 3779 }, 3780 3781 /* BSpec: 11299 */ 3782 { 3783 _3D_CHICKEN3, 3784 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, 3785 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), 3786 } 3787 }; 3788 3789 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 3790 3791 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ 3792 batch = gen8_emit_flush_coherentl3_wa(engine, batch); 3793 3794 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */ 3795 batch = gen8_emit_pipe_control(batch, 3796 PIPE_CONTROL_FLUSH_L3 | 3797 PIPE_CONTROL_STORE_DATA_INDEX | 3798 PIPE_CONTROL_CS_STALL | 3799 PIPE_CONTROL_QW_WRITE, 3800 LRC_PPHWSP_SCRATCH_ADDR); 3801 3802 batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); 3803 3804 /* WaMediaPoolStateCmdInWABB:bxt,glk */ 3805 if (HAS_POOLED_EU(engine->i915)) { 3806 /* 3807 * EU pool configuration is setup along with golden context 3808 * during context initialization. This value depends on 3809 * device type (2x6 or 3x6) and needs to be updated based 3810 * on which subslice is disabled especially for 2x6 3811 * devices, however it is safe to load default 3812 * configuration of 3x6 device instead of masking off 3813 * corresponding bits because HW ignores bits of a disabled 3814 * subslice and drops down to appropriate config. Please 3815 * see render_state_setup() in i915_gem_render_state.c for 3816 * possible configurations, to avoid duplication they are 3817 * not shown here again. 3818 */ 3819 *batch++ = GEN9_MEDIA_POOL_STATE; 3820 *batch++ = GEN9_MEDIA_POOL_ENABLE; 3821 *batch++ = 0x00777000; 3822 *batch++ = 0; 3823 *batch++ = 0; 3824 *batch++ = 0; 3825 } 3826 3827 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 3828 3829 /* Pad to end of cacheline */ 3830 while ((unsigned long)batch % CACHELINE_BYTES) 3831 *batch++ = MI_NOOP; 3832 3833 return batch; 3834 } 3835 3836 static u32 * 3837 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) 3838 { 3839 int i; 3840 3841 /* 3842 * WaPipeControlBefore3DStateSamplePattern: cnl 3843 * 3844 * Ensure the engine is idle prior to programming a 3845 * 3DSTATE_SAMPLE_PATTERN during a context restore. 3846 */ 3847 batch = gen8_emit_pipe_control(batch, 3848 PIPE_CONTROL_CS_STALL, 3849 0); 3850 /* 3851 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for 3852 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in 3853 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is 3854 * confusing. Since gen8_emit_pipe_control() already advances the 3855 * batch by 6 dwords, we advance the other 10 here, completing a 3856 * cacheline. It's not clear if the workaround requires this padding 3857 * before other commands, or if it's just the regular padding we would 3858 * already have for the workaround bb, so leave it here for now. 
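 *
 * For the record, the arithmetic is: 6 dwords of PIPE_CONTROL plus the
 * 10 MI_NOOPs below gives 16 dwords, i.e. 64 bytes, so assuming the
 * batch started out cacheline aligned (which the wa_bb setup checks),
 * the generic cacheline padding that follows should have nothing left
 * to do.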
3859 */ 3860 for (i = 0; i < 10; i++) 3861 *batch++ = MI_NOOP; 3862 3863 /* Pad to end of cacheline */ 3864 while ((unsigned long)batch % CACHELINE_BYTES) 3865 *batch++ = MI_NOOP; 3866 3867 return batch; 3868 } 3869 3870 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) 3871 3872 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) 3873 { 3874 struct drm_i915_gem_object *obj; 3875 struct i915_vma *vma; 3876 int err; 3877 3878 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); 3879 if (IS_ERR(obj)) 3880 return PTR_ERR(obj); 3881 3882 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL); 3883 if (IS_ERR(vma)) { 3884 err = PTR_ERR(vma); 3885 goto err; 3886 } 3887 3888 err = i915_ggtt_pin(vma, 0, PIN_HIGH); 3889 if (err) 3890 goto err; 3891 3892 engine->wa_ctx.vma = vma; 3893 return 0; 3894 3895 err: 3896 i915_gem_object_put(obj); 3897 return err; 3898 } 3899 3900 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) 3901 { 3902 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); 3903 } 3904 3905 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); 3906 3907 static int intel_init_workaround_bb(struct intel_engine_cs *engine) 3908 { 3909 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; 3910 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, 3911 &wa_ctx->per_ctx }; 3912 wa_bb_func_t wa_bb_fn[2]; 3913 void *batch, *batch_ptr; 3914 unsigned int i; 3915 int ret; 3916 3917 if (engine->class != RENDER_CLASS) 3918 return 0; 3919 3920 switch (INTEL_GEN(engine->i915)) { 3921 case 12: 3922 case 11: 3923 return 0; 3924 case 10: 3925 wa_bb_fn[0] = gen10_init_indirectctx_bb; 3926 wa_bb_fn[1] = NULL; 3927 break; 3928 case 9: 3929 wa_bb_fn[0] = gen9_init_indirectctx_bb; 3930 wa_bb_fn[1] = NULL; 3931 break; 3932 case 8: 3933 wa_bb_fn[0] = gen8_init_indirectctx_bb; 3934 wa_bb_fn[1] = NULL; 3935 break; 3936 default: 3937 MISSING_CASE(INTEL_GEN(engine->i915)); 3938 return 0; 3939 } 3940 3941 ret = lrc_setup_wa_ctx(engine); 3942 if (ret) { 3943 drm_dbg(&engine->i915->drm, 3944 "Failed to setup context WA page: %d\n", ret); 3945 return ret; 3946 } 3947 3948 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB); 3949 3950 /* 3951 * Emit the two workaround batch buffers, recording the offset from the 3952 * start of the workaround batch buffer object for each and their 3953 * respective sizes. 3954 */ 3955 batch_ptr = batch; 3956 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { 3957 wa_bb[i]->offset = batch_ptr - batch; 3958 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, 3959 CACHELINE_BYTES))) { 3960 ret = -EINVAL; 3961 break; 3962 } 3963 if (wa_bb_fn[i]) 3964 batch_ptr = wa_bb_fn[i](engine, batch_ptr); 3965 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); 3966 } 3967 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); 3968 3969 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch); 3970 __i915_gem_object_release_map(wa_ctx->vma->obj); 3971 if (ret) 3972 lrc_destroy_wa_ctx(engine); 3973 3974 return ret; 3975 } 3976 3977 static void reset_csb_pointers(struct intel_engine_cs *engine) 3978 { 3979 struct intel_engine_execlists * const execlists = &engine->execlists; 3980 const unsigned int reset_value = execlists->csb_size - 1; 3981 3982 ring_set_paused(engine, 0); 3983 3984 /* 3985 * Sometimes Icelake forgets to reset its pointers on a GPU reset. 3986 * Bludgeon them with a mmio update to be sure. 
3987 */ 3988 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 3989 0xffff << 16 | reset_value << 8 | reset_value); 3990 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 3991 3992 /* 3993 * After a reset, the HW starts writing into CSB entry [0]. We 3994 * therefore have to set our HEAD pointer back one entry so that 3995 * the *first* entry we check is entry 0. To complicate this further, 3996 * as we don't wait for the first interrupt after reset, we have to 3997 * fake the HW write to point back to the last entry so that our 3998 * inline comparison of our cached head position against the last HW 3999 * write works even before the first interrupt. 4000 */ 4001 execlists->csb_head = reset_value; 4002 WRITE_ONCE(*execlists->csb_write, reset_value); 4003 wmb(); /* Make sure this is visible to HW (paranoia?) */ 4004 4005 invalidate_csb_entries(&execlists->csb_status[0], 4006 &execlists->csb_status[reset_value]); 4007 4008 /* Once more for luck and our trusty paranoia */ 4009 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, 4010 0xffff << 16 | reset_value << 8 | reset_value); 4011 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); 4012 4013 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); 4014 } 4015 4016 static void execlists_sanitize(struct intel_engine_cs *engine) 4017 { 4018 /* 4019 * Poison residual state on resume, in case the suspend didn't! 4020 * 4021 * We have to assume that across suspend/resume (or other loss 4022 * of control) that the contents of our pinned buffers has been 4023 * lost, replaced by garbage. Since this doesn't always happen, 4024 * let's poison such state so that we more quickly spot when 4025 * we falsely assume it has been preserved. 4026 */ 4027 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 4028 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); 4029 4030 reset_csb_pointers(engine); 4031 4032 /* 4033 * The kernel_context HWSP is stored in the status_page. As above, 4034 * that may be lost on resume/initialisation, and so we need to 4035 * reset the value in the HWSP. 4036 */ 4037 intel_timeline_reset_seqno(engine->kernel_context->timeline); 4038 4039 /* And scrub the dirty cachelines for the HWSP */ 4040 clflush_cache_range(engine->status_page.addr, PAGE_SIZE); 4041 } 4042 4043 static void enable_error_interrupt(struct intel_engine_cs *engine) 4044 { 4045 u32 status; 4046 4047 engine->execlists.error_interrupt = 0; 4048 ENGINE_WRITE(engine, RING_EMR, ~0u); 4049 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ 4050 4051 status = ENGINE_READ(engine, RING_ESR); 4052 if (unlikely(status)) { 4053 drm_err(&engine->i915->drm, 4054 "engine '%s' resumed still in error: %08x\n", 4055 engine->name, status); 4056 __intel_gt_reset(engine->gt, engine->mask); 4057 } 4058 4059 /* 4060 * On current gen8+, we have 2 signals to play with 4061 * 4062 * - I915_ERROR_INSTUCTION (bit 0) 4063 * 4064 * Generate an error if the command parser encounters an invalid 4065 * instruction 4066 * 4067 * This is a fatal error. 4068 * 4069 * - CP_PRIV (bit 2) 4070 * 4071 * Generate an error on privilege violation (where the CP replaces 4072 * the instruction with a no-op). This also fires for writes into 4073 * read-only scratch pages. 4074 * 4075 * This is a non-fatal error, parsing continues. 
4076 * 4077 * * there are a few others defined for odd HW that we do not use 4078 * 4079 * Since CP_PRIV fires for cases where we have chosen to ignore the 4080 * error (as the HW is validating and suppressing the mistakes), we 4081 * only unmask the instruction error bit. 4082 */ 4083 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); 4084 } 4085 4086 static void enable_execlists(struct intel_engine_cs *engine) 4087 { 4088 u32 mode; 4089 4090 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL); 4091 4092 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ 4093 4094 if (INTEL_GEN(engine->i915) >= 11) 4095 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE); 4096 else 4097 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE); 4098 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode); 4099 4100 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); 4101 4102 ENGINE_WRITE_FW(engine, 4103 RING_HWS_PGA, 4104 i915_ggtt_offset(engine->status_page.vma)); 4105 ENGINE_POSTING_READ(engine, RING_HWS_PGA); 4106 4107 enable_error_interrupt(engine); 4108 4109 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0); 4110 } 4111 4112 static bool unexpected_starting_state(struct intel_engine_cs *engine) 4113 { 4114 bool unexpected = false; 4115 4116 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { 4117 drm_dbg(&engine->i915->drm, 4118 "STOP_RING still set in RING_MI_MODE\n"); 4119 unexpected = true; 4120 } 4121 4122 return unexpected; 4123 } 4124 4125 static int execlists_resume(struct intel_engine_cs *engine) 4126 { 4127 intel_mocs_init_engine(engine); 4128 4129 intel_engine_reset_breadcrumbs(engine); 4130 4131 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { 4132 struct drm_printer p = drm_debug_printer(__func__); 4133 4134 intel_engine_dump(engine, &p, NULL); 4135 } 4136 4137 enable_execlists(engine); 4138 4139 return 0; 4140 } 4141 4142 static void execlists_reset_prepare(struct intel_engine_cs *engine) 4143 { 4144 struct intel_engine_execlists * const execlists = &engine->execlists; 4145 unsigned long flags; 4146 4147 ENGINE_TRACE(engine, "depth<-%d\n", 4148 atomic_read(&execlists->tasklet.count)); 4149 4150 /* 4151 * Prevent request submission to the hardware until we have 4152 * completed the reset in i915_gem_reset_finish(). If a request 4153 * is completed by one engine, it may then queue a request 4154 * to a second via its execlists->tasklet *just* as we are 4155 * calling engine->resume() and also writing the ELSP. 4156 * Turning off the execlists->tasklet until the reset is over 4157 * prevents the race. 4158 */ 4159 __tasklet_disable_sync_once(&execlists->tasklet); 4160 GEM_BUG_ON(!reset_in_progress(execlists)); 4161 4162 /* And flush any current direct submission. */ 4163 spin_lock_irqsave(&engine->active.lock, flags); 4164 spin_unlock_irqrestore(&engine->active.lock, flags); 4165 4166 /* 4167 * We stop engines, otherwise we might get failed reset and a 4168 * dead gpu (on elk). Also as modern gpu as kbl can suffer 4169 * from system hang if batchbuffer is progressing when 4170 * the reset is issued, regardless of READY_TO_RESET ack. 4171 * Thus assume it is best to stop engines on all gens 4172 * where we have a gpu reset. 
4173 * 4174 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) 4175 * 4176 * FIXME: Wa for more modern gens needs to be validated 4177 */ 4178 ring_set_paused(engine, 1); 4179 intel_engine_stop_cs(engine); 4180 4181 engine->execlists.reset_ccid = active_ccid(engine); 4182 } 4183 4184 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) 4185 { 4186 int x; 4187 4188 x = lrc_ring_mi_mode(engine); 4189 if (x != -1) { 4190 regs[x + 1] &= ~STOP_RING; 4191 regs[x + 1] |= STOP_RING << 16; 4192 } 4193 } 4194 4195 static void __execlists_reset_reg_state(const struct intel_context *ce, 4196 const struct intel_engine_cs *engine) 4197 { 4198 u32 *regs = ce->lrc_reg_state; 4199 4200 __reset_stop_ring(regs, engine); 4201 } 4202 4203 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) 4204 { 4205 struct intel_engine_execlists * const execlists = &engine->execlists; 4206 struct intel_context *ce; 4207 struct i915_request *rq; 4208 u32 head; 4209 4210 mb(); /* paranoia: read the CSB pointers from after the reset */ 4211 clflush(execlists->csb_write); 4212 mb(); 4213 4214 process_csb(engine); /* drain preemption events */ 4215 4216 /* Following the reset, we need to reload the CSB read/write pointers */ 4217 reset_csb_pointers(engine); 4218 4219 /* 4220 * Save the currently executing context, even if we completed 4221 * its request, it was still running at the time of the 4222 * reset and will have been clobbered. 4223 */ 4224 rq = active_context(engine, engine->execlists.reset_ccid); 4225 if (!rq) 4226 goto unwind; 4227 4228 ce = rq->context; 4229 GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); 4230 4231 if (i915_request_completed(rq)) { 4232 /* Idle context; tidy up the ring so we can restart afresh */ 4233 head = intel_ring_wrap(ce->ring, rq->tail); 4234 goto out_replay; 4235 } 4236 4237 /* We still have requests in-flight; the engine should be active */ 4238 GEM_BUG_ON(!intel_engine_pm_is_awake(engine)); 4239 4240 /* Context has requests still in-flight; it should not be idle! */ 4241 GEM_BUG_ON(i915_active_is_idle(&ce->active)); 4242 4243 rq = active_request(ce->timeline, rq); 4244 head = intel_ring_wrap(ce->ring, rq->head); 4245 GEM_BUG_ON(head == ce->ring->tail); 4246 4247 /* 4248 * If this request hasn't started yet, e.g. it is waiting on a 4249 * semaphore, we need to avoid skipping the request or else we 4250 * break the signaling chain. However, if the context is corrupt 4251 * the request will not restart and we will be stuck with a wedged 4252 * device. It is quite often the case that if we issue a reset 4253 * while the GPU is loading the context image, that the context 4254 * image becomes corrupt. 4255 * 4256 * Otherwise, if we have not started yet, the request should replay 4257 * perfectly and we do not need to flag the result as being erroneous. 4258 */ 4259 if (!i915_request_started(rq)) 4260 goto out_replay; 4261 4262 /* 4263 * If the request was innocent, we leave the request in the ELSP 4264 * and will try to replay it on restarting. The context image may 4265 * have been corrupted by the reset, in which case we may have 4266 * to service a new GPU hang, but more likely we can continue on 4267 * without impact. 4268 * 4269 * If the request was guilty, we presume the context is corrupt 4270 * and have to at least restore the RING register in the context 4271 * image back to the expected values to skip over the guilty request. 
4272 */ 4273 __i915_request_reset(rq, stalled); 4274 4275 /* 4276 * We want a simple context + ring to execute the breadcrumb update. 4277 * We cannot rely on the context being intact across the GPU hang, 4278 * so clear it and rebuild just what we need for the breadcrumb. 4279 * All pending requests for this context will be zapped, and any 4280 * future request will be after userspace has had the opportunity 4281 * to recreate its own state. 4282 */ 4283 out_replay: 4284 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", 4285 head, ce->ring->tail); 4286 __execlists_reset_reg_state(ce, engine); 4287 __execlists_update_reg_state(ce, engine, head); 4288 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */ 4289 4290 unwind: 4291 /* Push back any incomplete requests for replay after the reset. */ 4292 cancel_port_requests(execlists); 4293 __unwind_incomplete_requests(engine); 4294 } 4295 4296 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled) 4297 { 4298 unsigned long flags; 4299 4300 ENGINE_TRACE(engine, "\n"); 4301 4302 spin_lock_irqsave(&engine->active.lock, flags); 4303 4304 __execlists_reset(engine, stalled); 4305 4306 spin_unlock_irqrestore(&engine->active.lock, flags); 4307 } 4308 4309 static void nop_submission_tasklet(unsigned long data) 4310 { 4311 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; 4312 4313 /* The driver is wedged; don't process any more events. */ 4314 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN); 4315 } 4316 4317 static void execlists_reset_cancel(struct intel_engine_cs *engine) 4318 { 4319 struct intel_engine_execlists * const execlists = &engine->execlists; 4320 struct i915_request *rq, *rn; 4321 struct rb_node *rb; 4322 unsigned long flags; 4323 4324 ENGINE_TRACE(engine, "\n"); 4325 4326 /* 4327 * Before we call engine->cancel_requests(), we should have exclusive 4328 * access to the submission state. This is arranged for us by the 4329 * caller disabling the interrupt generation, the tasklet and other 4330 * threads that may then access the same state, giving us a free hand 4331 * to reset state. However, we still need to let lockdep be aware that 4332 * we know this state may be accessed in hardirq context, so we 4333 * disable the irq around this manipulation and we want to keep 4334 * the spinlock focused on its duties and not accidentally conflate 4335 * coverage to the submission's irq state. (Similarly, although we 4336 * shouldn't need to disable irq around the manipulation of the 4337 * submission's irq state, we also wish to remind ourselves that 4338 * it is irq state.) 4339 */ 4340 spin_lock_irqsave(&engine->active.lock, flags); 4341 4342 __execlists_reset(engine, true); 4343 4344 /* Mark all executing requests as skipped. */ 4345 list_for_each_entry(rq, &engine->active.requests, sched.link) 4346 mark_eio(rq); 4347 4348 /* Flush the queued requests to the timeline list (for retiring). 
*/ 4349 while ((rb = rb_first_cached(&execlists->queue))) { 4350 struct i915_priolist *p = to_priolist(rb); 4351 int i; 4352 4353 priolist_for_each_request_consume(rq, rn, p, i) { 4354 mark_eio(rq); 4355 __i915_request_submit(rq); 4356 } 4357 4358 rb_erase_cached(&p->node, &execlists->queue); 4359 i915_priolist_free(p); 4360 } 4361 4362 /* On-hold requests will be flushed to timeline upon their release */ 4363 list_for_each_entry(rq, &engine->active.hold, sched.link) 4364 mark_eio(rq); 4365 4366 /* Cancel all attached virtual engines */ 4367 while ((rb = rb_first_cached(&execlists->virtual))) { 4368 struct virtual_engine *ve = 4369 rb_entry(rb, typeof(*ve), nodes[engine->id].rb); 4370 4371 rb_erase_cached(rb, &execlists->virtual); 4372 RB_CLEAR_NODE(rb); 4373 4374 spin_lock(&ve->base.active.lock); 4375 rq = fetch_and_zero(&ve->request); 4376 if (rq) { 4377 mark_eio(rq); 4378 4379 rq->engine = engine; 4380 __i915_request_submit(rq); 4381 i915_request_put(rq); 4382 4383 ve->base.execlists.queue_priority_hint = INT_MIN; 4384 } 4385 spin_unlock(&ve->base.active.lock); 4386 } 4387 4388 /* Remaining _unready_ requests will be nop'ed when submitted */ 4389 4390 execlists->queue_priority_hint = INT_MIN; 4391 execlists->queue = RB_ROOT_CACHED; 4392 4393 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); 4394 execlists->tasklet.func = nop_submission_tasklet; 4395 4396 spin_unlock_irqrestore(&engine->active.lock, flags); 4397 } 4398 4399 static void execlists_reset_finish(struct intel_engine_cs *engine) 4400 { 4401 struct intel_engine_execlists * const execlists = &engine->execlists; 4402 4403 /* 4404 * After a GPU reset, we may have requests to replay. Do so now while 4405 * we still have the forcewake to be sure that the GPU is not allowed 4406 * to sleep before we restart and reload a context. 4407 */ 4408 GEM_BUG_ON(!reset_in_progress(execlists)); 4409 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) 4410 execlists->tasklet.func(execlists->tasklet.data); 4411 4412 if (__tasklet_enable(&execlists->tasklet)) 4413 /* And kick in case we missed a new request submission. */ 4414 tasklet_hi_schedule(&execlists->tasklet); 4415 ENGINE_TRACE(engine, "depth->%d\n", 4416 atomic_read(&execlists->tasklet.count)); 4417 } 4418 4419 static int gen8_emit_bb_start_noarb(struct i915_request *rq, 4420 u64 offset, u32 len, 4421 const unsigned int flags) 4422 { 4423 u32 *cs; 4424 4425 cs = intel_ring_begin(rq, 4); 4426 if (IS_ERR(cs)) 4427 return PTR_ERR(cs); 4428 4429 /* 4430 * WaDisableCtxRestoreArbitration:bdw,chv 4431 * 4432 * We don't need to perform MI_ARB_ENABLE as often as we do (in 4433 * particular all the gen that do not need the w/a at all!), if we 4434 * took care to make sure that on every switch into this context 4435 * (both ordinary and for preemption) that arbitrartion was enabled 4436 * we would be fine. However, for gen8 there is another w/a that 4437 * requires us to not preempt inside GPGPU execution, so we keep 4438 * arbitration disabled for gen8 batches. Arbitration will be 4439 * re-enabled before we close the request 4440 * (engine->emit_fini_breadcrumb). 4441 */ 4442 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4443 4444 /* FIXME(BDW+): Address space and security selectors. */ 4445 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4446 (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); 4447 *cs++ = lower_32_bits(offset); 4448 *cs++ = upper_32_bits(offset); 4449 4450 intel_ring_advance(rq, cs); 4451 4452 return 0; 4453 } 4454 4455 static int gen8_emit_bb_start(struct i915_request *rq, 4456 u64 offset, u32 len, 4457 const unsigned int flags) 4458 { 4459 u32 *cs; 4460 4461 cs = intel_ring_begin(rq, 6); 4462 if (IS_ERR(cs)) 4463 return PTR_ERR(cs); 4464 4465 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4466 4467 *cs++ = MI_BATCH_BUFFER_START_GEN8 | 4468 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); 4469 *cs++ = lower_32_bits(offset); 4470 *cs++ = upper_32_bits(offset); 4471 4472 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; 4473 *cs++ = MI_NOOP; 4474 4475 intel_ring_advance(rq, cs); 4476 4477 return 0; 4478 } 4479 4480 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) 4481 { 4482 ENGINE_WRITE(engine, RING_IMR, 4483 ~(engine->irq_enable_mask | engine->irq_keep_mask)); 4484 ENGINE_POSTING_READ(engine, RING_IMR); 4485 } 4486 4487 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) 4488 { 4489 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); 4490 } 4491 4492 static int gen8_emit_flush(struct i915_request *request, u32 mode) 4493 { 4494 u32 cmd, *cs; 4495 4496 cs = intel_ring_begin(request, 4); 4497 if (IS_ERR(cs)) 4498 return PTR_ERR(cs); 4499 4500 cmd = MI_FLUSH_DW + 1; 4501 4502 /* We always require a command barrier so that subsequent 4503 * commands, such as breadcrumb interrupts, are strictly ordered 4504 * wrt the contents of the write cache being flushed to memory 4505 * (and thus being coherent from the CPU). 4506 */ 4507 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4508 4509 if (mode & EMIT_INVALIDATE) { 4510 cmd |= MI_INVALIDATE_TLB; 4511 if (request->engine->class == VIDEO_DECODE_CLASS) 4512 cmd |= MI_INVALIDATE_BSD; 4513 } 4514 4515 *cs++ = cmd; 4516 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4517 *cs++ = 0; /* upper addr */ 4518 *cs++ = 0; /* value */ 4519 intel_ring_advance(request, cs); 4520 4521 return 0; 4522 } 4523 4524 static int gen8_emit_flush_render(struct i915_request *request, 4525 u32 mode) 4526 { 4527 bool vf_flush_wa = false, dc_flush_wa = false; 4528 u32 *cs, flags = 0; 4529 int len; 4530 4531 flags |= PIPE_CONTROL_CS_STALL; 4532 4533 if (mode & EMIT_FLUSH) { 4534 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4535 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4536 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4537 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4538 } 4539 4540 if (mode & EMIT_INVALIDATE) { 4541 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4542 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4543 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4544 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4545 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4546 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4547 flags |= PIPE_CONTROL_QW_WRITE; 4548 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4549 4550 /* 4551 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL 4552 * pipe control. 
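 * A "NULL" pipe control is simply a PIPE_CONTROL with no flags set
 * (gen8_emit_pipe_control(cs, 0, 0) below), hence vf_flush_wa adding
 * 6 dwords to the ring allocation. The KBL dc_flush_wa likewise
 * brackets the main flush with a DC-flush-only and a CS-stall-only
 * PIPE_CONTROL, i.e. another 2 * 6 = 12 dwords.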
4553 */ 4554 if (IS_GEN(request->engine->i915, 9)) 4555 vf_flush_wa = true; 4556 4557 /* WaForGAMHang:kbl */ 4558 if (IS_KBL_REVID(request->engine->i915, 0, KBL_REVID_B0)) 4559 dc_flush_wa = true; 4560 } 4561 4562 len = 6; 4563 4564 if (vf_flush_wa) 4565 len += 6; 4566 4567 if (dc_flush_wa) 4568 len += 12; 4569 4570 cs = intel_ring_begin(request, len); 4571 if (IS_ERR(cs)) 4572 return PTR_ERR(cs); 4573 4574 if (vf_flush_wa) 4575 cs = gen8_emit_pipe_control(cs, 0, 0); 4576 4577 if (dc_flush_wa) 4578 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, 4579 0); 4580 4581 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4582 4583 if (dc_flush_wa) 4584 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); 4585 4586 intel_ring_advance(request, cs); 4587 4588 return 0; 4589 } 4590 4591 static int gen11_emit_flush_render(struct i915_request *request, 4592 u32 mode) 4593 { 4594 if (mode & EMIT_FLUSH) { 4595 u32 *cs; 4596 u32 flags = 0; 4597 4598 flags |= PIPE_CONTROL_CS_STALL; 4599 4600 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4601 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4602 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4603 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4604 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4605 flags |= PIPE_CONTROL_QW_WRITE; 4606 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4607 4608 cs = intel_ring_begin(request, 6); 4609 if (IS_ERR(cs)) 4610 return PTR_ERR(cs); 4611 4612 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4613 intel_ring_advance(request, cs); 4614 } 4615 4616 if (mode & EMIT_INVALIDATE) { 4617 u32 *cs; 4618 u32 flags = 0; 4619 4620 flags |= PIPE_CONTROL_CS_STALL; 4621 4622 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4623 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4624 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4625 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4626 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4627 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4628 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4629 flags |= PIPE_CONTROL_QW_WRITE; 4630 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4631 4632 cs = intel_ring_begin(request, 6); 4633 if (IS_ERR(cs)) 4634 return PTR_ERR(cs); 4635 4636 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4637 intel_ring_advance(request, cs); 4638 } 4639 4640 return 0; 4641 } 4642 4643 static u32 preparser_disable(bool state) 4644 { 4645 return MI_ARB_CHECK | 1 << 8 | state; 4646 } 4647 4648 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) 4649 { 4650 static const i915_reg_t vd[] = { 4651 GEN12_VD0_AUX_NV, 4652 GEN12_VD1_AUX_NV, 4653 GEN12_VD2_AUX_NV, 4654 GEN12_VD3_AUX_NV, 4655 }; 4656 4657 static const i915_reg_t ve[] = { 4658 GEN12_VE0_AUX_NV, 4659 GEN12_VE1_AUX_NV, 4660 }; 4661 4662 if (engine->class == VIDEO_DECODE_CLASS) 4663 return vd[engine->instance]; 4664 4665 if (engine->class == VIDEO_ENHANCEMENT_CLASS) 4666 return ve[engine->instance]; 4667 4668 GEM_BUG_ON("unknown aux_inv_reg\n"); 4669 4670 return INVALID_MMIO_REG; 4671 } 4672 4673 static u32 * 4674 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) 4675 { 4676 *cs++ = MI_LOAD_REGISTER_IMM(1); 4677 *cs++ = i915_mmio_reg_offset(inv_reg); 4678 *cs++ = AUX_INV; 4679 *cs++ = MI_NOOP; 4680 4681 return cs; 4682 } 4683 4684 static int gen12_emit_flush_render(struct i915_request *request, 4685 u32 mode) 4686 { 4687 if (mode & EMIT_FLUSH) { 4688 u32 flags = 0; 4689 u32 *cs; 4690 4691 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; 4692 flags |= PIPE_CONTROL_FLUSH_L3; 4693 flags |= 
PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; 4694 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; 4695 /* Wa_1409600907:tgl */ 4696 flags |= PIPE_CONTROL_DEPTH_STALL; 4697 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; 4698 flags |= PIPE_CONTROL_FLUSH_ENABLE; 4699 4700 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4701 flags |= PIPE_CONTROL_QW_WRITE; 4702 4703 flags |= PIPE_CONTROL_CS_STALL; 4704 4705 cs = intel_ring_begin(request, 6); 4706 if (IS_ERR(cs)) 4707 return PTR_ERR(cs); 4708 4709 cs = gen12_emit_pipe_control(cs, 4710 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4711 flags, LRC_PPHWSP_SCRATCH_ADDR); 4712 intel_ring_advance(request, cs); 4713 } 4714 4715 if (mode & EMIT_INVALIDATE) { 4716 u32 flags = 0; 4717 u32 *cs; 4718 4719 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE; 4720 flags |= PIPE_CONTROL_TLB_INVALIDATE; 4721 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; 4722 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; 4723 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; 4724 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; 4725 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; 4726 4727 flags |= PIPE_CONTROL_STORE_DATA_INDEX; 4728 flags |= PIPE_CONTROL_QW_WRITE; 4729 4730 flags |= PIPE_CONTROL_CS_STALL; 4731 4732 cs = intel_ring_begin(request, 8 + 4); 4733 if (IS_ERR(cs)) 4734 return PTR_ERR(cs); 4735 4736 /* 4737 * Prevent the pre-parser from skipping past the TLB 4738 * invalidate and loading a stale page for the batch 4739 * buffer / request payload. 4740 */ 4741 *cs++ = preparser_disable(true); 4742 4743 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); 4744 4745 /* hsdes: 1809175790 */ 4746 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); 4747 4748 *cs++ = preparser_disable(false); 4749 intel_ring_advance(request, cs); 4750 } 4751 4752 return 0; 4753 } 4754 4755 static int gen12_emit_flush(struct i915_request *request, u32 mode) 4756 { 4757 intel_engine_mask_t aux_inv = 0; 4758 u32 cmd, *cs; 4759 4760 if (mode & EMIT_INVALIDATE) 4761 aux_inv = request->engine->mask & ~BIT(BCS0); 4762 4763 cs = intel_ring_begin(request, 4764 4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0)); 4765 if (IS_ERR(cs)) 4766 return PTR_ERR(cs); 4767 4768 cmd = MI_FLUSH_DW + 1; 4769 4770 /* We always require a command barrier so that subsequent 4771 * commands, such as breadcrumb interrupts, are strictly ordered 4772 * wrt the contents of the write cache being flushed to memory 4773 * (and thus being coherent from the CPU). 4774 */ 4775 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; 4776 4777 if (mode & EMIT_INVALIDATE) { 4778 cmd |= MI_INVALIDATE_TLB; 4779 if (request->engine->class == VIDEO_DECODE_CLASS) 4780 cmd |= MI_INVALIDATE_BSD; 4781 } 4782 4783 *cs++ = cmd; 4784 *cs++ = LRC_PPHWSP_SCRATCH_ADDR; 4785 *cs++ = 0; /* upper addr */ 4786 *cs++ = 0; /* value */ 4787 4788 if (aux_inv) { /* hsdes: 1809175790 */ 4789 struct intel_engine_cs *engine; 4790 unsigned int tmp; 4791 4792 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); 4793 for_each_engine_masked(engine, request->engine->gt, 4794 aux_inv, tmp) { 4795 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); 4796 *cs++ = AUX_INV; 4797 } 4798 *cs++ = MI_NOOP; 4799 } 4800 intel_ring_advance(request, cs); 4801 4802 return 0; 4803 } 4804 4805 static void assert_request_valid(struct i915_request *rq) 4806 { 4807 struct intel_ring *ring __maybe_unused = rq->ring; 4808 4809 /* Can we unwind this request without appearing to go forwards? 
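 * That is, the span from rq->head to rq->wa_tail must stay under half
 * the ring, as the signed ring distance used here cannot otherwise
 * tell a rewind apart from an advance.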
*/ 4810 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); 4811 } 4812 4813 /* 4814 * Reserve space for 2 NOOPs at the end of each request to be 4815 * used as a workaround for not being allowed to do lite 4816 * restore with HEAD==TAIL (WaIdleLiteRestore). 4817 */ 4818 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) 4819 { 4820 /* Ensure there's always at least one preemption point per-request. */ 4821 *cs++ = MI_ARB_CHECK; 4822 *cs++ = MI_NOOP; 4823 request->wa_tail = intel_ring_offset(request, cs); 4824 4825 /* Check that entire request is less than half the ring */ 4826 assert_request_valid(request); 4827 4828 return cs; 4829 } 4830 4831 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) 4832 { 4833 *cs++ = MI_SEMAPHORE_WAIT | 4834 MI_SEMAPHORE_GLOBAL_GTT | 4835 MI_SEMAPHORE_POLL | 4836 MI_SEMAPHORE_SAD_EQ_SDD; 4837 *cs++ = 0; 4838 *cs++ = intel_hws_preempt_address(request->engine); 4839 *cs++ = 0; 4840 4841 return cs; 4842 } 4843 4844 static __always_inline u32* 4845 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4846 { 4847 *cs++ = MI_USER_INTERRUPT; 4848 4849 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4850 if (intel_engine_has_semaphores(request->engine)) 4851 cs = emit_preempt_busywait(request, cs); 4852 4853 request->tail = intel_ring_offset(request, cs); 4854 assert_ring_tail_valid(request->ring, request->tail); 4855 4856 return gen8_emit_wa_tail(request, cs); 4857 } 4858 4859 static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) 4860 { 4861 u32 addr = i915_request_active_timeline(request)->hwsp_offset; 4862 4863 return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); 4864 } 4865 4866 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4867 { 4868 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4869 } 4870 4871 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4872 { 4873 cs = gen8_emit_pipe_control(cs, 4874 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4875 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4876 PIPE_CONTROL_DC_FLUSH_ENABLE, 4877 0); 4878 4879 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ 4880 cs = gen8_emit_ggtt_write_rcs(cs, 4881 request->fence.seqno, 4882 i915_request_active_timeline(request)->hwsp_offset, 4883 PIPE_CONTROL_FLUSH_ENABLE | 4884 PIPE_CONTROL_CS_STALL); 4885 4886 return gen8_emit_fini_breadcrumb_tail(request, cs); 4887 } 4888 4889 static u32 * 4890 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4891 { 4892 cs = gen8_emit_ggtt_write_rcs(cs, 4893 request->fence.seqno, 4894 i915_request_active_timeline(request)->hwsp_offset, 4895 PIPE_CONTROL_CS_STALL | 4896 PIPE_CONTROL_TILE_CACHE_FLUSH | 4897 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4898 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4899 PIPE_CONTROL_DC_FLUSH_ENABLE | 4900 PIPE_CONTROL_FLUSH_ENABLE); 4901 4902 return gen8_emit_fini_breadcrumb_tail(request, cs); 4903 } 4904 4905 /* 4906 * Note that the CS instruction pre-parser will not stall on the breadcrumb 4907 * flush and will continue pre-fetching the instructions after it before the 4908 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at 4909 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble 4910 * of the next request before the memory has been flushed, we're guaranteed that 4911 * we won't access the batch itself too early. 
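 * (Put differently: before gen12 the instructions of the next batch
 * are not fetched until the CS has actually reached its BB_START, by
 * which point the preceding flush has completed.)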
4912 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands, 4913 * so, if the current request is modifying an instruction in the next request on 4914 * the same intel_context, we might pre-fetch and then execute the pre-update 4915 * instruction. To avoid this, the users of self-modifying code should either 4916 * disable the parser around the code emitting the memory writes, via a new flag 4917 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For 4918 * the in-kernel use-cases we've opted to use a separate context, see 4919 * reloc_gpu() as an example. 4920 * All the above applies only to the instructions themselves. Non-inline data 4921 * used by the instructions is not pre-fetched. 4922 */ 4923 4924 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) 4925 { 4926 *cs++ = MI_SEMAPHORE_WAIT_TOKEN | 4927 MI_SEMAPHORE_GLOBAL_GTT | 4928 MI_SEMAPHORE_POLL | 4929 MI_SEMAPHORE_SAD_EQ_SDD; 4930 *cs++ = 0; 4931 *cs++ = intel_hws_preempt_address(request->engine); 4932 *cs++ = 0; 4933 *cs++ = 0; 4934 *cs++ = MI_NOOP; 4935 4936 return cs; 4937 } 4938 4939 static __always_inline u32* 4940 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) 4941 { 4942 *cs++ = MI_USER_INTERRUPT; 4943 4944 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; 4945 if (intel_engine_has_semaphores(request->engine)) 4946 cs = gen12_emit_preempt_busywait(request, cs); 4947 4948 request->tail = intel_ring_offset(request, cs); 4949 assert_ring_tail_valid(request->ring, request->tail); 4950 4951 return gen8_emit_wa_tail(request, cs); 4952 } 4953 4954 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 4955 { 4956 return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); 4957 } 4958 4959 static u32 * 4960 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) 4961 { 4962 cs = gen12_emit_ggtt_write_rcs(cs, 4963 request->fence.seqno, 4964 i915_request_active_timeline(request)->hwsp_offset, 4965 PIPE_CONTROL0_HDC_PIPELINE_FLUSH, 4966 PIPE_CONTROL_CS_STALL | 4967 PIPE_CONTROL_TILE_CACHE_FLUSH | 4968 PIPE_CONTROL_FLUSH_L3 | 4969 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | 4970 PIPE_CONTROL_DEPTH_CACHE_FLUSH | 4971 /* Wa_1409600907:tgl */ 4972 PIPE_CONTROL_DEPTH_STALL | 4973 PIPE_CONTROL_DC_FLUSH_ENABLE | 4974 PIPE_CONTROL_FLUSH_ENABLE); 4975 4976 return gen12_emit_fini_breadcrumb_tail(request, cs); 4977 } 4978 4979 static void execlists_park(struct intel_engine_cs *engine) 4980 { 4981 cancel_timer(&engine->execlists.timer); 4982 cancel_timer(&engine->execlists.preempt); 4983 } 4984 4985 void intel_execlists_set_default_submission(struct intel_engine_cs *engine) 4986 { 4987 engine->submit_request = execlists_submit_request; 4988 engine->schedule = i915_schedule; 4989 engine->execlists.tasklet.func = execlists_submission_tasklet; 4990 4991 engine->reset.prepare = execlists_reset_prepare; 4992 engine->reset.rewind = execlists_reset_rewind; 4993 engine->reset.cancel = execlists_reset_cancel; 4994 engine->reset.finish = execlists_reset_finish; 4995 4996 engine->park = execlists_park; 4997 engine->unpark = NULL; 4998 4999 engine->flags |= I915_ENGINE_SUPPORTS_STATS; 5000 if (!intel_vgpu_active(engine->i915)) { 5001 engine->flags |= I915_ENGINE_HAS_SEMAPHORES; 5002 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) { 5003 engine->flags |= I915_ENGINE_HAS_PREEMPTION; 5004 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) 5005 engine->flags |= I915_ENGINE_HAS_TIMESLICES; 5006 } 5007 } 5008 5009 if 
(INTEL_GEN(engine->i915) >= 12) 5010 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO; 5011 5012 if (intel_engine_has_preemption(engine)) 5013 engine->emit_bb_start = gen8_emit_bb_start; 5014 else 5015 engine->emit_bb_start = gen8_emit_bb_start_noarb; 5016 } 5017 5018 static void execlists_shutdown(struct intel_engine_cs *engine) 5019 { 5020 /* Synchronise with residual timers and any softirq they raise */ 5021 del_timer_sync(&engine->execlists.timer); 5022 del_timer_sync(&engine->execlists.preempt); 5023 tasklet_kill(&engine->execlists.tasklet); 5024 } 5025 5026 static void execlists_release(struct intel_engine_cs *engine) 5027 { 5028 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ 5029 5030 execlists_shutdown(engine); 5031 5032 intel_engine_cleanup_common(engine); 5033 lrc_destroy_wa_ctx(engine); 5034 } 5035 5036 static void 5037 logical_ring_default_vfuncs(struct intel_engine_cs *engine) 5038 { 5039 /* Default vfuncs which can be overridden by each engine. */ 5040 5041 engine->resume = execlists_resume; 5042 5043 engine->cops = &execlists_context_ops; 5044 engine->request_alloc = execlists_request_alloc; 5045 5046 engine->emit_flush = gen8_emit_flush; 5047 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; 5048 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; 5049 if (INTEL_GEN(engine->i915) >= 12) { 5050 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; 5051 engine->emit_flush = gen12_emit_flush; 5052 } 5053 engine->set_default_submission = intel_execlists_set_default_submission; 5054 5055 if (INTEL_GEN(engine->i915) < 11) { 5056 engine->irq_enable = gen8_logical_ring_enable_irq; 5057 engine->irq_disable = gen8_logical_ring_disable_irq; 5058 } else { 5059 /* 5060 * TODO: On Gen11 interrupt masks need to be clear 5061 * to allow C6 entry. Keep interrupts enabled 5062 * and take the hit of generating extra interrupts 5063 * until a more refined solution exists.
5064 */ 5065 } 5066 } 5067 5068 static inline void 5069 logical_ring_default_irqs(struct intel_engine_cs *engine) 5070 { 5071 unsigned int shift = 0; 5072 5073 if (INTEL_GEN(engine->i915) < 11) { 5074 const u8 irq_shifts[] = { 5075 [RCS0] = GEN8_RCS_IRQ_SHIFT, 5076 [BCS0] = GEN8_BCS_IRQ_SHIFT, 5077 [VCS0] = GEN8_VCS0_IRQ_SHIFT, 5078 [VCS1] = GEN8_VCS1_IRQ_SHIFT, 5079 [VECS0] = GEN8_VECS_IRQ_SHIFT, 5080 }; 5081 5082 shift = irq_shifts[engine->id]; 5083 } 5084 5085 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; 5086 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; 5087 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; 5088 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift; 5089 } 5090 5091 static void rcs_submission_override(struct intel_engine_cs *engine) 5092 { 5093 switch (INTEL_GEN(engine->i915)) { 5094 case 12: 5095 engine->emit_flush = gen12_emit_flush_render; 5096 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs; 5097 break; 5098 case 11: 5099 engine->emit_flush = gen11_emit_flush_render; 5100 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs; 5101 break; 5102 default: 5103 engine->emit_flush = gen8_emit_flush_render; 5104 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; 5105 break; 5106 } 5107 } 5108 5109 int intel_execlists_submission_setup(struct intel_engine_cs *engine) 5110 { 5111 struct intel_engine_execlists * const execlists = &engine->execlists; 5112 struct drm_i915_private *i915 = engine->i915; 5113 struct intel_uncore *uncore = engine->uncore; 5114 u32 base = engine->mmio_base; 5115 5116 tasklet_init(&engine->execlists.tasklet, 5117 execlists_submission_tasklet, (unsigned long)engine); 5118 timer_setup(&engine->execlists.timer, execlists_timeslice, 0); 5119 timer_setup(&engine->execlists.preempt, execlists_preempt, 0); 5120 5121 logical_ring_default_vfuncs(engine); 5122 logical_ring_default_irqs(engine); 5123 5124 if (engine->class == RENDER_CLASS) 5125 rcs_submission_override(engine); 5126 5127 if (intel_init_workaround_bb(engine)) 5128 /* 5129 * We continue even if we fail to initialize WA batch 5130 * because we only expect rare glitches but nothing 5131 * critical to prevent us from using GPU 5132 */ 5133 drm_err(&i915->drm, "WA batch buffer initialization failed\n"); 5134 5135 if (HAS_LOGICAL_RING_ELSQ(i915)) { 5136 execlists->submit_reg = uncore->regs + 5137 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); 5138 execlists->ctrl_reg = uncore->regs + 5139 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); 5140 } else { 5141 execlists->submit_reg = uncore->regs + 5142 i915_mmio_reg_offset(RING_ELSP(base)); 5143 } 5144 5145 execlists->csb_status = 5146 &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; 5147 5148 execlists->csb_write = 5149 &engine->status_page.addr[intel_hws_csb_write_index(i915)]; 5150 5151 if (INTEL_GEN(i915) < 11) 5152 execlists->csb_size = GEN8_CSB_ENTRIES; 5153 else 5154 execlists->csb_size = GEN11_CSB_ENTRIES; 5155 5156 if (INTEL_GEN(engine->i915) >= 11) { 5157 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32); 5158 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); 5159 } 5160 5161 /* Finally, take ownership and responsibility for cleanup! 
*/ 5162 engine->sanitize = execlists_sanitize; 5163 engine->release = execlists_release; 5164 5165 return 0; 5166 } 5167 5168 static void init_common_reg_state(u32 * const regs, 5169 const struct intel_engine_cs *engine, 5170 const struct intel_ring *ring, 5171 bool inhibit) 5172 { 5173 u32 ctl; 5174 5175 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH); 5176 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); 5177 if (inhibit) 5178 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT; 5179 if (INTEL_GEN(engine->i915) < 11) 5180 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | 5181 CTX_CTRL_RS_CTX_ENABLE); 5182 regs[CTX_CONTEXT_CONTROL] = ctl; 5183 5184 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; 5185 regs[CTX_TIMESTAMP] = 0; 5186 } 5187 5188 static void init_wa_bb_reg_state(u32 * const regs, 5189 const struct intel_engine_cs *engine) 5190 { 5191 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; 5192 5193 if (wa_ctx->per_ctx.size) { 5194 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); 5195 5196 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); 5197 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = 5198 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; 5199 } 5200 5201 if (wa_ctx->indirect_ctx.size) { 5202 lrc_ring_setup_indirect_ctx(regs, engine, 5203 i915_ggtt_offset(wa_ctx->vma) + 5204 wa_ctx->indirect_ctx.offset, 5205 wa_ctx->indirect_ctx.size); 5206 } 5207 } 5208 5209 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt) 5210 { 5211 if (i915_vm_is_4lvl(&ppgtt->vm)) { 5212 /* 64b PPGTT (48bit canonical) 5213 * PDP0_DESCRIPTOR contains the base address to PML4 and 5214 * other PDP Descriptors are ignored. 5215 */ 5216 ASSIGN_CTX_PML4(ppgtt, regs); 5217 } else { 5218 ASSIGN_CTX_PDP(ppgtt, regs, 3); 5219 ASSIGN_CTX_PDP(ppgtt, regs, 2); 5220 ASSIGN_CTX_PDP(ppgtt, regs, 1); 5221 ASSIGN_CTX_PDP(ppgtt, regs, 0); 5222 } 5223 } 5224 5225 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm) 5226 { 5227 if (i915_is_ggtt(vm)) 5228 return i915_vm_to_ggtt(vm)->alias; 5229 else 5230 return i915_vm_to_ppgtt(vm); 5231 } 5232 5233 static void execlists_init_reg_state(u32 *regs, 5234 const struct intel_context *ce, 5235 const struct intel_engine_cs *engine, 5236 const struct intel_ring *ring, 5237 bool inhibit) 5238 { 5239 /* 5240 * A context is actually a big batch buffer with several 5241 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The 5242 * values we are setting here are only for the first context restore: 5243 * on a subsequent save, the GPU will recreate this batchbuffer with new 5244 * values (including all the missing MI_LOAD_REGISTER_IMM commands that 5245 * we are not initializing here). 5246 * 5247 * Must keep consistent with virtual_update_register_offsets(). 
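 *
 * Roughly speaking, set_offsets() below writes the MI_LOAD_REGISTER_IMM
 * headers and register offsets from the per-engine reg_offsets() table,
 * while the init_*_reg_state() helpers then fill in the handful of
 * values (ring control, PDPs, wa_bb pointers) we care about up front.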
5248 */ 5249 set_offsets(regs, reg_offsets(engine), engine, inhibit); 5250 5251 init_common_reg_state(regs, engine, ring, inhibit); 5252 init_ppgtt_reg_state(regs, vm_alias(ce->vm)); 5253 5254 init_wa_bb_reg_state(regs, engine); 5255 5256 __reset_stop_ring(regs, engine); 5257 } 5258 5259 static int 5260 populate_lr_context(struct intel_context *ce, 5261 struct drm_i915_gem_object *ctx_obj, 5262 struct intel_engine_cs *engine, 5263 struct intel_ring *ring) 5264 { 5265 bool inhibit = true; 5266 void *vaddr; 5267 5268 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); 5269 if (IS_ERR(vaddr)) { 5270 drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); 5271 return PTR_ERR(vaddr); 5272 } 5273 5274 set_redzone(vaddr, engine); 5275 5276 if (engine->default_state) { 5277 shmem_read(engine->default_state, 0, 5278 vaddr, engine->context_size); 5279 __set_bit(CONTEXT_VALID_BIT, &ce->flags); 5280 inhibit = false; 5281 } 5282 5283 /* Clear the ppHWSP (inc. per-context counters) */ 5284 memset(vaddr, 0, PAGE_SIZE); 5285 5286 /* 5287 * The second page of the context object contains some registers which 5288 * must be set up prior to the first execution. 5289 */ 5290 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, 5291 ce, engine, ring, inhibit); 5292 5293 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); 5294 i915_gem_object_unpin_map(ctx_obj); 5295 return 0; 5296 } 5297 5298 static int __execlists_context_alloc(struct intel_context *ce, 5299 struct intel_engine_cs *engine) 5300 { 5301 struct drm_i915_gem_object *ctx_obj; 5302 struct intel_ring *ring; 5303 struct i915_vma *vma; 5304 u32 context_size; 5305 int ret; 5306 5307 GEM_BUG_ON(ce->state); 5308 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); 5309 5310 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) 5311 context_size += I915_GTT_PAGE_SIZE; /* for redzone */ 5312 5313 if (INTEL_GEN(engine->i915) == 12) { 5314 ce->wa_bb_page = context_size / PAGE_SIZE; 5315 context_size += PAGE_SIZE; 5316 } 5317 5318 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); 5319 if (IS_ERR(ctx_obj)) 5320 return PTR_ERR(ctx_obj); 5321 5322 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL); 5323 if (IS_ERR(vma)) { 5324 ret = PTR_ERR(vma); 5325 goto error_deref_obj; 5326 } 5327 5328 if (!ce->timeline) { 5329 struct intel_timeline *tl; 5330 struct i915_vma *hwsp; 5331 5332 /* 5333 * Use the static global HWSP for the kernel context, and 5334 * a dynamically allocated cacheline for everyone else. 
5335 */ 5336 hwsp = NULL; 5337 if (unlikely(intel_context_is_barrier(ce))) 5338 hwsp = engine->status_page.vma; 5339 5340 tl = intel_timeline_create(engine->gt, hwsp); 5341 if (IS_ERR(tl)) { 5342 ret = PTR_ERR(tl); 5343 goto error_deref_obj; 5344 } 5345 5346 ce->timeline = tl; 5347 } 5348 5349 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring); 5350 if (IS_ERR(ring)) { 5351 ret = PTR_ERR(ring); 5352 goto error_deref_obj; 5353 } 5354 5355 ret = populate_lr_context(ce, ctx_obj, engine, ring); 5356 if (ret) { 5357 drm_dbg(&engine->i915->drm, 5358 "Failed to populate LRC: %d\n", ret); 5359 goto error_ring_free; 5360 } 5361 5362 ce->ring = ring; 5363 ce->state = vma; 5364 5365 return 0; 5366 5367 error_ring_free: 5368 intel_ring_put(ring); 5369 error_deref_obj: 5370 i915_gem_object_put(ctx_obj); 5371 return ret; 5372 } 5373 5374 static struct list_head *virtual_queue(struct virtual_engine *ve) 5375 { 5376 return &ve->base.execlists.default_priolist.requests[0]; 5377 } 5378 5379 static void virtual_context_destroy(struct kref *kref) 5380 { 5381 struct virtual_engine *ve = 5382 container_of(kref, typeof(*ve), context.ref); 5383 unsigned int n; 5384 5385 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5386 GEM_BUG_ON(ve->request); 5387 GEM_BUG_ON(ve->context.inflight); 5388 5389 for (n = 0; n < ve->num_siblings; n++) { 5390 struct intel_engine_cs *sibling = ve->siblings[n]; 5391 struct rb_node *node = &ve->nodes[sibling->id].rb; 5392 unsigned long flags; 5393 5394 if (RB_EMPTY_NODE(node)) 5395 continue; 5396 5397 spin_lock_irqsave(&sibling->active.lock, flags); 5398 5399 /* Detachment is lazily performed in the execlists tasklet */ 5400 if (!RB_EMPTY_NODE(node)) 5401 rb_erase_cached(node, &sibling->execlists.virtual); 5402 5403 spin_unlock_irqrestore(&sibling->active.lock, flags); 5404 } 5405 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); 5406 5407 if (ve->context.state) 5408 __execlists_context_fini(&ve->context); 5409 intel_context_fini(&ve->context); 5410 5411 intel_engine_free_request_pool(&ve->base); 5412 5413 kfree(ve->bonds); 5414 kfree(ve); 5415 } 5416 5417 static void virtual_engine_initial_hint(struct virtual_engine *ve) 5418 { 5419 int swp; 5420 5421 /* 5422 * Pick a random sibling on starting to help spread the load around. 5423 * 5424 * New contexts are typically created with exactly the same order 5425 * of siblings, and often started in batches. Due to the way we iterate 5426 * the array of sibling when submitting requests, sibling[0] is 5427 * prioritised for dequeuing. If we make sure that sibling[0] is fairly 5428 * randomised across the system, we also help spread the load by the 5429 * first engine we inspect being different each time. 5430 * 5431 * NB This does not force us to execute on this engine, it will just 5432 * typically be the first we inspect for submission. 
5433 */ 5434 swp = prandom_u32_max(ve->num_siblings); 5435 if (swp) 5436 swap(ve->siblings[swp], ve->siblings[0]); 5437 } 5438 5439 static int virtual_context_alloc(struct intel_context *ce) 5440 { 5441 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5442 5443 return __execlists_context_alloc(ce, ve->siblings[0]); 5444 } 5445 5446 static int virtual_context_pin(struct intel_context *ce) 5447 { 5448 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5449 5450 /* Note: we must use a real engine class for setting up reg state */ 5451 return __execlists_context_pin(ce, ve->siblings[0]); 5452 } 5453 5454 static void virtual_context_enter(struct intel_context *ce) 5455 { 5456 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5457 unsigned int n; 5458 5459 for (n = 0; n < ve->num_siblings; n++) 5460 intel_engine_pm_get(ve->siblings[n]); 5461 5462 intel_timeline_enter(ce->timeline); 5463 } 5464 5465 static void virtual_context_exit(struct intel_context *ce) 5466 { 5467 struct virtual_engine *ve = container_of(ce, typeof(*ve), context); 5468 unsigned int n; 5469 5470 intel_timeline_exit(ce->timeline); 5471 5472 for (n = 0; n < ve->num_siblings; n++) 5473 intel_engine_pm_put(ve->siblings[n]); 5474 } 5475 5476 static const struct intel_context_ops virtual_context_ops = { 5477 .alloc = virtual_context_alloc, 5478 5479 .pin = virtual_context_pin, 5480 .unpin = execlists_context_unpin, 5481 5482 .enter = virtual_context_enter, 5483 .exit = virtual_context_exit, 5484 5485 .destroy = virtual_context_destroy, 5486 }; 5487 5488 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) 5489 { 5490 struct i915_request *rq; 5491 intel_engine_mask_t mask; 5492 5493 rq = READ_ONCE(ve->request); 5494 if (!rq) 5495 return 0; 5496 5497 /* The rq is ready for submission; rq->execution_mask is now stable. */ 5498 mask = rq->execution_mask; 5499 if (unlikely(!mask)) { 5500 /* Invalid selection, submit to a random engine in error */ 5501 i915_request_set_error_once(rq, -ENODEV); 5502 mask = ve->siblings[0]->mask; 5503 } 5504 5505 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n", 5506 rq->fence.context, rq->fence.seqno, 5507 mask, ve->base.execlists.queue_priority_hint); 5508 5509 return mask; 5510 } 5511 5512 static void virtual_submission_tasklet(unsigned long data) 5513 { 5514 struct virtual_engine * const ve = (struct virtual_engine *)data; 5515 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint); 5516 intel_engine_mask_t mask; 5517 unsigned int n; 5518 5519 rcu_read_lock(); 5520 mask = virtual_submission_mask(ve); 5521 rcu_read_unlock(); 5522 if (unlikely(!mask)) 5523 return; 5524 5525 local_irq_disable(); 5526 for (n = 0; n < ve->num_siblings; n++) { 5527 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); 5528 struct ve_node * const node = &ve->nodes[sibling->id]; 5529 struct rb_node **parent, *rb; 5530 bool first; 5531 5532 if (!READ_ONCE(ve->request)) 5533 break; /* already handled by a sibling's tasklet */ 5534 5535 if (unlikely(!(mask & sibling->mask))) { 5536 if (!RB_EMPTY_NODE(&node->rb)) { 5537 spin_lock(&sibling->active.lock); 5538 rb_erase_cached(&node->rb, 5539 &sibling->execlists.virtual); 5540 RB_CLEAR_NODE(&node->rb); 5541 spin_unlock(&sibling->active.lock); 5542 } 5543 continue; 5544 } 5545 5546 spin_lock(&sibling->active.lock); 5547 5548 if (!RB_EMPTY_NODE(&node->rb)) { 5549 /* 5550 * Cheat and avoid rebalancing the tree if we can 5551 * reuse this node in situ. 
5552 */ 5553 first = rb_first_cached(&sibling->execlists.virtual) == 5554 &node->rb; 5555 if (prio == node->prio || (prio > node->prio && first)) 5556 goto submit_engine; 5557 5558 rb_erase_cached(&node->rb, &sibling->execlists.virtual); 5559 } 5560 5561 rb = NULL; 5562 first = true; 5563 parent = &sibling->execlists.virtual.rb_root.rb_node; 5564 while (*parent) { 5565 struct ve_node *other; 5566 5567 rb = *parent; 5568 other = rb_entry(rb, typeof(*other), rb); 5569 if (prio > other->prio) { 5570 parent = &rb->rb_left; 5571 } else { 5572 parent = &rb->rb_right; 5573 first = false; 5574 } 5575 } 5576 5577 rb_link_node(&node->rb, rb, parent); 5578 rb_insert_color_cached(&node->rb, 5579 &sibling->execlists.virtual, 5580 first); 5581 5582 submit_engine: 5583 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); 5584 node->prio = prio; 5585 if (first && prio > sibling->execlists.queue_priority_hint) 5586 tasklet_hi_schedule(&sibling->execlists.tasklet); 5587 5588 spin_unlock(&sibling->active.lock); 5589 } 5590 local_irq_enable(); 5591 } 5592 5593 static void virtual_submit_request(struct i915_request *rq) 5594 { 5595 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5596 struct i915_request *old; 5597 unsigned long flags; 5598 5599 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n", 5600 rq->fence.context, 5601 rq->fence.seqno); 5602 5603 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); 5604 5605 spin_lock_irqsave(&ve->base.active.lock, flags); 5606 5607 old = ve->request; 5608 if (old) { /* background completion event from preempt-to-busy */ 5609 GEM_BUG_ON(!i915_request_completed(old)); 5610 __i915_request_submit(old); 5611 i915_request_put(old); 5612 } 5613 5614 if (i915_request_completed(rq)) { 5615 __i915_request_submit(rq); 5616 5617 ve->base.execlists.queue_priority_hint = INT_MIN; 5618 ve->request = NULL; 5619 } else { 5620 ve->base.execlists.queue_priority_hint = rq_prio(rq); 5621 ve->request = i915_request_get(rq); 5622 5623 GEM_BUG_ON(!list_empty(virtual_queue(ve))); 5624 list_move_tail(&rq->sched.link, virtual_queue(ve)); 5625 5626 tasklet_hi_schedule(&ve->base.execlists.tasklet); 5627 } 5628 5629 spin_unlock_irqrestore(&ve->base.active.lock, flags); 5630 } 5631 5632 static struct ve_bond * 5633 virtual_find_bond(struct virtual_engine *ve, 5634 const struct intel_engine_cs *master) 5635 { 5636 int i; 5637 5638 for (i = 0; i < ve->num_bonds; i++) { 5639 if (ve->bonds[i].master == master) 5640 return &ve->bonds[i]; 5641 } 5642 5643 return NULL; 5644 } 5645 5646 static void 5647 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) 5648 { 5649 struct virtual_engine *ve = to_virtual_engine(rq->engine); 5650 intel_engine_mask_t allowed, exec; 5651 struct ve_bond *bond; 5652 5653 allowed = ~to_request(signal)->engine->mask; 5654 5655 bond = virtual_find_bond(ve, to_request(signal)->engine); 5656 if (bond) 5657 allowed &= bond->sibling_mask; 5658 5659 /* Restrict the bonded request to run on only the available engines */ 5660 exec = READ_ONCE(rq->execution_mask); 5661 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed)) 5662 ; 5663 5664 /* Prevent the master from being re-run on the bonded engines */ 5665 to_request(signal)->execution_mask &= ~allowed; 5666 } 5667 5668 struct intel_context * 5669 intel_execlists_create_virtual(struct intel_engine_cs **siblings, 5670 unsigned int count) 5671 { 5672 struct virtual_engine *ve; 5673 unsigned int n; 5674 int err; 5675 5676 if (count == 0) 5677 return ERR_PTR(-EINVAL); 5678 5679 if (count == 1) 5680 return 
struct intel_context *
intel_execlists_create_virtual(struct intel_engine_cs **siblings,
			       unsigned int count)
{
	struct virtual_engine *ve;
	unsigned int n;
	int err;

	if (count == 0)
		return ERR_PTR(-EINVAL);

	if (count == 1)
		return intel_context_create(siblings[0]);

	ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
	if (!ve)
		return ERR_PTR(-ENOMEM);

	ve->base.i915 = siblings[0]->i915;
	ve->base.gt = siblings[0]->gt;
	ve->base.uncore = siblings[0]->uncore;
	ve->base.id = -1;

	ve->base.class = OTHER_CLASS;
	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
	ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;

	/*
	 * The decision on whether to submit a request using semaphores
	 * depends on the saturated state of the engine. We only compute
	 * this during HW submission of the request, and we need this
	 * state to be globally applied to all requests being submitted
	 * to this engine. Virtual engines encompass more than one physical
	 * engine and so we cannot accurately tell in advance if one of those
	 * engines is already saturated and so cannot afford to use a semaphore
	 * and be pessimized in priority for doing so -- if we are the only
	 * context using semaphores after all other clients have stopped, we
	 * will be starved on the saturated system. Such a global switch for
	 * semaphores is less than ideal, but alas is the current compromise.
	 */
	ve->base.saturated = ALL_ENGINES;

	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");

	intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
	intel_engine_init_breadcrumbs(&ve->base);
	intel_engine_init_execlists(&ve->base);
	ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */

	ve->base.cops = &virtual_context_ops;
	ve->base.request_alloc = execlists_request_alloc;

	ve->base.schedule = i915_schedule;
	ve->base.submit_request = virtual_submit_request;
	ve->base.bond_execute = virtual_bond_execute;

	INIT_LIST_HEAD(virtual_queue(ve));
	ve->base.execlists.queue_priority_hint = INT_MIN;
	tasklet_init(&ve->base.execlists.tasklet,
		     virtual_submission_tasklet,
		     (unsigned long)ve);

	intel_context_init(&ve->context, &ve->base);

	for (n = 0; n < count; n++) {
		struct intel_engine_cs *sibling = siblings[n];

		GEM_BUG_ON(!is_power_of_2(sibling->mask));
		if (sibling->mask & ve->base.mask) {
			DRM_DEBUG("duplicate %s entry in load balancer\n",
				  sibling->name);
			err = -EINVAL;
			goto err_put;
		}

		/*
		 * The virtual engine implementation is tightly coupled to
		 * the execlists backend -- we push out requests directly
		 * into a tree inside each physical engine. We could support
		 * layering if we handle cloning of the requests and
		 * submitting a copy into each backend.
		 */
		if (sibling->execlists.tasklet.func !=
		    execlists_submission_tasklet) {
			err = -ENODEV;
			goto err_put;
		}

		GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);

		ve->siblings[ve->num_siblings++] = sibling;
		ve->base.mask |= sibling->mask;

		/*
		 * All physical engines must be compatible for their emission
		 * functions (as we build the instructions during request
		 * construction and do not alter them before submission
		 * on the physical engine). We use the engine class as a guide
		 * here, although that could be refined.
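		 * For example, all siblings must be VCS engines or all must
		 * be VECS engines; the class check below rejects any attempt
		 * to mix engine classes within one virtual engine.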
		 */
		if (ve->base.class != OTHER_CLASS) {
			if (ve->base.class != sibling->class) {
				DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
					  sibling->class, ve->base.class);
				err = -EINVAL;
				goto err_put;
			}
			continue;
		}

		ve->base.class = sibling->class;
		ve->base.uabi_class = sibling->uabi_class;
		snprintf(ve->base.name, sizeof(ve->base.name),
			 "v%dx%d", ve->base.class, count);
		ve->base.context_size = sibling->context_size;

		ve->base.emit_bb_start = sibling->emit_bb_start;
		ve->base.emit_flush = sibling->emit_flush;
		ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
		ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
		ve->base.emit_fini_breadcrumb_dw =
			sibling->emit_fini_breadcrumb_dw;

		ve->base.flags = sibling->flags;
	}

	ve->base.flags |= I915_ENGINE_IS_VIRTUAL;

	virtual_engine_initial_hint(ve);
	return &ve->context;

err_put:
	intel_context_put(&ve->context);
	return ERR_PTR(err);
}

struct intel_context *
intel_execlists_clone_virtual(struct intel_engine_cs *src)
{
	struct virtual_engine *se = to_virtual_engine(src);
	struct intel_context *dst;

	dst = intel_execlists_create_virtual(se->siblings,
					     se->num_siblings);
	if (IS_ERR(dst))
		return dst;

	if (se->num_bonds) {
		struct virtual_engine *de = to_virtual_engine(dst->engine);

		de->bonds = kmemdup(se->bonds,
				    sizeof(*se->bonds) * se->num_bonds,
				    GFP_KERNEL);
		if (!de->bonds) {
			intel_context_put(dst);
			return ERR_PTR(-ENOMEM);
		}

		de->num_bonds = se->num_bonds;
	}

	return dst;
}

int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
				     const struct intel_engine_cs *master,
				     const struct intel_engine_cs *sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);
	struct ve_bond *bond;
	int n;

	/* Sanity check the sibling is part of the virtual engine */
	for (n = 0; n < ve->num_siblings; n++)
		if (sibling == ve->siblings[n])
			break;
	if (n == ve->num_siblings)
		return -EINVAL;

	bond = virtual_find_bond(ve, master);
	if (bond) {
		bond->sibling_mask |= sibling->mask;
		return 0;
	}

	bond = krealloc(ve->bonds,
			sizeof(*bond) * (ve->num_bonds + 1),
			GFP_KERNEL);
	if (!bond)
		return -ENOMEM;

	bond[ve->num_bonds].master = master;
	bond[ve->num_bonds].sibling_mask = sibling->mask;

	ve->bonds = bond;
	ve->num_bonds++;

	return 0;
}

struct intel_engine_cs *
intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
				 unsigned int sibling)
{
	struct virtual_engine *ve = to_virtual_engine(engine);

	if (sibling >= ve->num_siblings)
		return NULL;

	return ve->siblings[sibling];
}

void intel_execlists_show_requests(struct intel_engine_cs *engine,
				   struct drm_printer *m,
				   void (*show_request)(struct drm_printer *m,
							struct i915_request *rq,
							const char *prefix),
				   unsigned int max)
{
	const struct intel_engine_execlists *execlists = &engine->execlists;
	struct i915_request *rq, *last;
	unsigned long flags;
	unsigned int count;
	struct rb_node *rb;

	spin_lock_irqsave(&engine->active.lock, flags);

	last = NULL;
	count = 0;
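	/*
	 * Three passes below: requests already passed to the HW backend
	 * (prefixed "E"), requests waiting in this engine's priority queue
	 * ("Q"), and requests still held on virtual engines that may execute
	 * here ("V"). Each pass prints at most @max entries, eliding the
	 * middle of the list when it is longer.
	 */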
	list_for_each_entry(rq, &engine->active.requests, sched.link) {
		if (count++ < max - 1)
			show_request(m, rq, "\t\tE ");
		else
			last = rq;
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d executing requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tE ");
	}

	if (execlists->switch_priority_hint != INT_MIN)
		drm_printf(m, "\t\tSwitch priority hint: %d\n",
			   READ_ONCE(execlists->switch_priority_hint));
	if (execlists->queue_priority_hint != INT_MIN)
		drm_printf(m, "\t\tQueue priority hint: %d\n",
			   READ_ONCE(execlists->queue_priority_hint));

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
		int i;

		priolist_for_each_request(rq, p, i) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tQ ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d queued requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tQ ");
	}

	last = NULL;
	count = 0;
	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		if (rq) {
			if (count++ < max - 1)
				show_request(m, rq, "\t\tV ");
			else
				last = rq;
		}
	}
	if (last) {
		if (count > max) {
			drm_printf(m,
				   "\t\t...skipping %d virtual requests...\n",
				   count - max);
		}
		show_request(m, last, "\t\tV ");
	}

	spin_unlock_irqrestore(&engine->active.lock, flags);
}

void intel_lr_context_reset(struct intel_engine_cs *engine,
			    struct intel_context *ce,
			    u32 head,
			    bool scrub)
{
	GEM_BUG_ON(!intel_context_is_pinned(ce));

	/*
	 * We want a simple context + ring to execute the breadcrumb update.
	 * We cannot rely on the context being intact across the GPU hang,
	 * so clear it and rebuild just what we need for the breadcrumb.
	 * All pending requests for this context will be zapped, and any
	 * future request will be after userspace has had the opportunity
	 * to recreate its own state.
	 */
	if (scrub)
		restore_default_state(ce, engine);

	/* Rerun the request; its payload has been neutered (if guilty). */
	__execlists_update_reg_state(ce, engine, head);
}

bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
	return engine->set_default_submission ==
	       intel_execlists_set_default_submission;
}

#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_lrc.c"
#endif